feat: add comprehensive PDF support

- Add jsPDF for PDF generation from text/Markdown/HTML
- Add PDF.js for PDF text extraction (read PDFs)
- Support PDF → Text/Markdown conversions
- Support Markdown/HTML/Text → PDF conversions
- Implement page-by-page PDF text extraction
- Automatic pagination and formatting for generated PDFs

Supported PDF operations:
- Extract text from PDF files (all pages)
- Convert PDF to Markdown or plain text
- Create formatted PDFs from Markdown, HTML, or plain text
- Automatic text wrapping and page breaks

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 11:13:09 +01:00
parent 9de639b138
commit b899989b3e
5 changed files with 658 additions and 2 deletions
--- a/lib/converters/pdfService.ts
+++ b/lib/converters/pdfService.ts
@@ -0,0 +1,334 @@
+import { jsPDF } from 'jspdf';
+import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
+
+/**
+ * Extract the text of every page of a PDF file.
+ *
+ * pdf.js is imported dynamically, the file is read into an ArrayBuffer, and
+ * the text items of each page are joined with single spaces. Pages are
+ * separated by a blank line and the combined result is trimmed.
+ *
+ * @param file - PDF file to read.
+ * @param onProgress - Optional callback; receives 10, 20, 30 and 50 during
+ *   setup, a rounded value in the 50–90 range after each page, and 100 at the
+ *   end.
+ * @returns The text of all pages.
+ * @throws Rejects with whatever error the dynamic import, reading the file,
+ *   or pdf.js raises. Once the loading task exists it is destroyed before the
+ *   error propagates.
+ */
+export async function extractTextFromPDF(file: File, onProgress?: ProgressCallback): Promise<string> {
+  if (onProgress) onProgress(10);
+
+  // Dynamically import pdfjs-dist (client-side only)
+  const pdfjsLib = await import('pdfjs-dist');
+
+  // Point pdf.js at its worker script. The URL is absolute (https://) rather
+  // than protocol-relative, because "//host/..." inherits the page's scheme and
+  // cannot be fetched from file:// or other non-HTTP origins.
+  pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.mjs`;
+
+  if (onProgress) onProgress(20);
+
+  // Read file as ArrayBuffer
+  const arrayBuffer = await file.arrayBuffer();
+
+  if (onProgress) onProgress(30);
+
+  // Load PDF document
+  const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });
+  const pageTexts: string[] = [];
+
+  try {
+    const pdf = await loadingTask.promise;
+
+    if (onProgress) onProgress(50);
+
+    const numPages = pdf.numPages;
+
+    // Extract text from each page (pdf.js page numbers are 1-based)
+    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
+      const page = await pdf.getPage(pageNum);
+      const textContent = await page.getTextContent();
+
+      // Keep only items that actually carry a string; anything without `str`
+      // would otherwise be joined into the output as the word "undefined".
+      const strings: string[] = [];
+      for (const item of textContent.items) {
+        if ('str' in item) strings.push(item.str);
+      }
+
+      pageTexts.push(strings.join(' '));
+
+      // Pages account for the 50–90 part of the progress range
+      if (onProgress) {
+        const progress = 50 + (pageNum / numPages) * 40;
+        onProgress(Math.round(progress));
+      }
+    }
+  } finally {
+    // Destroy the loading task on success and on failure so the loaded
+    // document and the pdf.js worker behind it are released after each call.
+    await loadingTask.destroy();
+  }
+
+  if (onProgress) onProgress(100);
+
+  // Pages are separated by a blank line; surrounding whitespace is dropped
+  return pageTexts.join('\n\n').trim();
+}
+
+/**
+ * Convert a PDF file into a plain-text blob.
+ *
+ * The text comes from `extractTextFromPDF`, which also receives `onProgress`
+ * unchanged. Errors thrown inside the conversion are caught, logged with
+ * `console.error`, and reported through the returned result.
+ *
+ * @param file - PDF file to convert.
+ * @param onProgress - Optional progress callback forwarded to the extractor.
+ * @returns On success `{ success: true, blob, duration }` with a `text/plain`
+ *   blob; on failure `{ success: false, error, duration }`. `duration` is the
+ *   elapsed time in milliseconds.
+ */
+export async function pdfToText(
+  file: File,
+  onProgress?: ProgressCallback
+): Promise<ConversionResult> {
+  const startedAt = Date.now();
+  // Milliseconds elapsed since the conversion started
+  const elapsed = (): number => Date.now() - startedAt;
+
+  try {
+    const extracted = await extractTextFromPDF(file, onProgress);
+    const textBlob = new Blob([extracted], { type: 'text/plain' });
+
+    return { success: true, blob: textBlob, duration: elapsed() };
+  } catch (error) {
+    console.error('[PDF Converter] PDF to text error:', error);
+
+    // Use the thrown Error's message when there is one, else a generic text
+    let message = 'Failed to extract text from PDF';
+    if (error instanceof Error) {
+      message = error.message;
+    }
+
+    return { success: false, error: message, duration: elapsed() };
+  }
+}
+
+/**
+ * Convert a PDF file into a Markdown blob.
+ *
+ * The text comes from `extractTextFromPDF`. It is split on blank lines into
+ * paragraphs, each paragraph is trimmed, empty ones are dropped, and the rest
+ * are re-joined with a blank line between them. No other Markdown structure
+ * (headings, lists, emphasis) is generated.
+ *
+ * @param file - PDF file to convert.
+ * @param onProgress - Optional callback. Extraction progress is scaled into
+ *   the 0–90 range and rounded to an integer; 100 is reported once the
+ *   Markdown text has been built.
+ * @returns On success `{ success: true, blob, duration }` with a
+ *   `text/markdown` blob; on failure `{ success: false, error, duration }`.
+ *   `duration` is the elapsed time in milliseconds.
+ */
+export async function pdfToMarkdown(
+  file: File,
+  onProgress?: ProgressCallback
+): Promise<ConversionResult> {
+  const startTime = Date.now();
+
+  try {
+    const text = await extractTextFromPDF(file, (progress) => {
+      // Use 90% for extraction. Rounded, because scaling by 0.9 would
+      // otherwise report fractional values such as 51.3.
+      if (onProgress) onProgress(Math.round(progress * 0.9));
+    });
+
+    // Basic text to markdown conversion (paragraphs). Each paragraph is
+    // trimmed so whitespace left over from joining PDF text items cannot be
+    // read as Markdown indentation (four leading spaces start a code block).
+    const markdown = text
+      .split('\n\n')
+      .map((paragraph) => paragraph.trim())
+      .filter((paragraph) => paragraph.length > 0)
+      .join('\n\n');
+
+    if (onProgress) onProgress(100);
+
+    const blob = new Blob([markdown], { type: 'text/markdown' });
+
+    return {
+      success: true,
+      blob,
+      duration: Date.now() - startTime,
+    };
+  } catch (error) {
+    console.error('[PDF Converter] PDF to markdown error:', error);
+
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : 'Failed to convert PDF to Markdown',
+      duration: Date.now() - startTime,
+    };
+  }
+}
+
+/**
+ * Lay plain text out as a portrait A4 PDF.
+ *
+ * The text is wrapped to the printable width (page width minus a 20 mm margin
+ * on each side) in 12 pt Helvetica and written line by line with a 7 mm line
+ * advance. A new page is started whenever the next line would cross the
+ * bottom margin.
+ *
+ * @param text - Text to render.
+ * @param filename - Accepted for callers' convenience; this function does not
+ *   read it.
+ * @param onProgress - Optional callback; receives 20, 40 and 60 during setup,
+ *   a rounded value in the 60–90 range on every tenth line, then 90 and 100.
+ * @returns The generated PDF as a Blob.
+ */
+export async function textToPDF(
+  text: string,
+  filename: string = 'document.pdf',
+  onProgress?: ProgressCallback
+): Promise<Blob> {
+  if (onProgress) onProgress(20);
+
+  const pdf = new jsPDF({
+    orientation: 'portrait',
+    unit: 'mm',
+    format: 'a4',
+  });
+
+  if (onProgress) onProgress(40);
+
+  // Typeface used for the whole document
+  pdf.setFont('helvetica');
+  pdf.setFontSize(12);
+
+  // Layout metrics, all in millimetres
+  const pageWidth = pdf.internal.pageSize.getWidth();
+  const pageHeight = pdf.internal.pageSize.getHeight();
+  const margin = 20;
+  const printableWidth = pageWidth - 2 * margin;
+  const lineHeight = 7;
+  const bottomLimit = pageHeight - margin;
+  let cursorY = margin;
+
+  if (onProgress) onProgress(60);
+
+  // Let jsPDF wrap the text to the printable width
+  const wrapped: string[] = pdf.splitTextToSize(text, printableWidth);
+  const total = wrapped.length;
+
+  wrapped.forEach((line: string, index: number) => {
+    // Start a fresh page when this line would run past the bottom margin
+    if (cursorY + lineHeight > bottomLimit) {
+      pdf.addPage();
+      cursorY = margin;
+    }
+
+    pdf.text(line, margin, cursorY);
+    cursorY += lineHeight;
+
+    // Report on every tenth line only, mapped into the 60–90 range
+    if (onProgress && index % 10 === 0) {
+      onProgress(Math.round(60 + (index / total) * 30));
+    }
+  });
+
+  if (onProgress) onProgress(90);
+
+  // Serialise the finished document
+  const result = pdf.output('blob');
+
+  if (onProgress) onProgress(100);
+
+  return result;
+}
+
+/**
+ * Convert a Markdown file into a PDF.
+ *
+ * The Markdown is rendered to HTML with `marked`, all tags are stripped, the
+ * character references are decoded, and the resulting plain text is laid out
+ * by `textToPDF`. Only the text survives; Markdown formatting is not
+ * reproduced visually.
+ *
+ * @param file - Markdown file to convert.
+ * @param onProgress - Optional callback; receives 10, 20, 40 and 60 during
+ *   preparation, then integer values scaled into the 60–100 range while the
+ *   PDF is generated.
+ * @returns On success `{ success: true, blob, duration }` with the PDF blob;
+ *   on failure `{ success: false, error, duration }`. `duration` is the
+ *   elapsed time in milliseconds.
+ */
+export async function markdownToPDF(
+  file: File,
+  onProgress?: ProgressCallback
+): Promise<ConversionResult> {
+  const startTime = Date.now();
+
+  // Turn a numeric character reference into its character. References outside
+  // the Unicode range are left as written instead of raising a RangeError.
+  const decodeCodePoint = (entity: string, codePoint: number): string =>
+    codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
+
+  try {
+    if (onProgress) onProgress(10);
+
+    // Read markdown content
+    const markdown = await file.text();
+
+    if (onProgress) onProgress(20);
+
+    // Import marked for markdown parsing
+    const { marked } = await import('marked');
+
+    // Parse markdown to HTML
+    const html = await marked.parse(markdown);
+
+    if (onProgress) onProgress(40);
+
+    // Strip HTML tags for plain text. Numeric references are decoded as well:
+    // marked escapes apostrophes as "&#39;", which would otherwise be printed
+    // literally. "&amp;" is decoded last so that escaped text such as
+    // "&amp;lt;" becomes "&lt;" rather than "<".
+    const text = html
+      .replace(/<[^>]*>/g, '')
+      .replace(/&nbsp;/g, ' ')
+      .replace(/&quot;/g, '"')
+      .replace(/&apos;/g, "'")
+      .replace(/&#(\d+);/g, (entity: string, digits: string) =>
+        decodeCodePoint(entity, Number.parseInt(digits, 10))
+      )
+      .replace(/&#x([0-9a-f]+);/gi, (entity: string, digits: string) =>
+        decodeCodePoint(entity, Number.parseInt(digits, 16))
+      )
+      .replace(/&lt;/g, '<')
+      .replace(/&gt;/g, '>')
+      .replace(/&amp;/g, '&');
+
+    if (onProgress) onProgress(60);
+
+    // Generate PDF. The name is derived case-insensitively from ".md" or
+    // ".markdown"; generation progress is mapped into 60–100 and rounded so
+    // callers never see fractional percentages.
+    const pdfName = file.name.replace(/\.(md|markdown)$/i, '.pdf');
+    const pdfBlob = await textToPDF(text, pdfName, (progress) => {
+      if (onProgress) onProgress(Math.round(60 + progress * 0.4));
+    });
+
+    return {
+      success: true,
+      blob: pdfBlob,
+      duration: Date.now() - startTime,
+    };
+  } catch (error) {
+    console.error('[PDF Converter] Markdown to PDF error:', error);
+
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : 'Failed to convert Markdown to PDF',
+      duration: Date.now() - startTime,
+    };
+  }
+}
+
+/**
+ * Convert an HTML file into a PDF.
+ *
+ * `<script>` and `<style>` elements are removed with their content, the
+ * remaining tags are replaced by spaces, character references are decoded,
+ * and all whitespace runs are collapsed to single spaces. The resulting
+ * single block of plain text is laid out by `textToPDF`, so the HTML
+ * structure and styling are not reproduced.
+ *
+ * @param file - HTML file to convert.
+ * @param onProgress - Optional callback; receives 10, 30 and 50 during
+ *   preparation, then integer values scaled into the 50–100 range while the
+ *   PDF is generated.
+ * @returns On success `{ success: true, blob, duration }` with the PDF blob;
+ *   on failure `{ success: false, error, duration }`. `duration` is the
+ *   elapsed time in milliseconds.
+ */
+export async function htmlToPDF(
+  file: File,
+  onProgress?: ProgressCallback
+): Promise<ConversionResult> {
+  const startTime = Date.now();
+
+  // Turn a numeric character reference into its character. References outside
+  // the Unicode range are left as written instead of raising a RangeError.
+  const decodeCodePoint = (entity: string, codePoint: number): string =>
+    codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
+
+  try {
+    if (onProgress) onProgress(10);
+
+    // Read HTML content
+    const html = await file.text();
+
+    if (onProgress) onProgress(30);
+
+    // Strip HTML tags for plain text. Numeric references (e.g. "&#8217;",
+    // "&#x2014;") are decoded too, otherwise they would be printed literally.
+    // "&amp;" is decoded last so that "&amp;lt;" becomes "&lt;", not "<".
+    const text = html
+      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
+      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
+      .replace(/<[^>]*>/g, ' ')
+      .replace(/&nbsp;/g, ' ')
+      .replace(/&quot;/g, '"')
+      .replace(/&apos;/g, "'")
+      .replace(/&#(\d+);/g, (entity: string, digits: string) =>
+        decodeCodePoint(entity, Number.parseInt(digits, 10))
+      )
+      .replace(/&#x([0-9a-f]+);/gi, (entity: string, digits: string) =>
+        decodeCodePoint(entity, Number.parseInt(digits, 16))
+      )
+      .replace(/&lt;/g, '<')
+      .replace(/&gt;/g, '>')
+      .replace(/&amp;/g, '&')
+      .replace(/\s+/g, ' ')
+      .trim();
+
+    if (onProgress) onProgress(50);
+
+    // Generate PDF. The extension is matched case-insensitively (".HTML",
+    // ".htm"); generation progress is mapped into 50–100 and rounded so
+    // callers never see fractional percentages.
+    const pdfName = file.name.replace(/\.html?$/i, '.pdf');
+    const pdfBlob = await textToPDF(text, pdfName, (progress) => {
+      if (onProgress) onProgress(Math.round(50 + progress * 0.5));
+    });
+
+    return {
+      success: true,
+      blob: pdfBlob,
+      duration: Date.now() - startTime,
+    };
+  } catch (error) {
+    console.error('[PDF Converter] HTML to PDF error:', error);
+
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : 'Failed to convert HTML to PDF',
+      duration: Date.now() - startTime,
+    };
+  }
+}
+
+/**
+ * Convert a plain-text file into a PDF.
+ *
+ * The file content is read as text and laid out by `textToPDF` without any
+ * further processing.
+ *
+ * @param file - Text file to convert.
+ * @param onProgress - Optional callback; receives 10 and 30 while the file is
+ *   read, then integer values scaled into the 30–100 range while the PDF is
+ *   generated.
+ * @returns On success `{ success: true, blob, duration }` with the PDF blob;
+ *   on failure `{ success: false, error, duration }`. `duration` is the
+ *   elapsed time in milliseconds.
+ */
+export async function plainTextToPDF(
+  file: File,
+  onProgress?: ProgressCallback
+): Promise<ConversionResult> {
+  const startTime = Date.now();
+
+  try {
+    if (onProgress) onProgress(10);
+
+    const text = await file.text();
+
+    if (onProgress) onProgress(30);
+
+    // The extension is matched case-insensitively so "NOTES.TXT" is renamed too
+    const pdfName = file.name.replace(/\.txt$/i, '.pdf');
+    const pdfBlob = await textToPDF(text, pdfName, (progress) => {
+      // Map generation progress into 30–100. Rounded, because 0.7 scaling
+      // otherwise produces float artefacts such as 72.69999999999999.
+      if (onProgress) onProgress(Math.round(30 + progress * 0.7));
+    });
+
+    return {
+      success: true,
+      blob: pdfBlob,
+      duration: Date.now() - startTime,
+    };
+  } catch (error) {
+    console.error('[PDF Converter] Text to PDF error:', error);
+
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : 'Failed to convert text to PDF',
+      duration: Date.now() - startTime,
+    };
+  }
+}