Files
convert-ui/lib/converters/pdfService.ts
Sebastian Krüger b899989b3e feat: add comprehensive PDF support
- Add jsPDF for PDF generation from text/Markdown/HTML
- Add PDF.js for PDF text extraction (read PDFs)
- Support PDF → Text/Markdown conversions
- Support Markdown/HTML/Text → PDF conversions
- Implement page-by-page PDF text extraction
- Automatic pagination and formatting for generated PDFs

Supported PDF operations:
- Extract text from PDF files (all pages)
- Convert PDF to Markdown or plain text
- Create formatted PDFs from Markdown, HTML, or plain text
- Automatic text wrapping and page breaks

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 11:13:09 +01:00

335 lines
7.8 KiB
TypeScript

import { jsPDF } from 'jspdf';
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
/**
 * Extract the text of every page of a PDF file.
 *
 * pdf.js is imported dynamically, its worker URL is set from the loaded
 * library version, and the file is parsed page by page. The text items of a
 * page are joined with single spaces; pages are separated by a blank line.
 *
 * @param file - The PDF file to read.
 * @param onProgress - Optional callback; receives 10, 20, 30 and 50 during
 *   setup, a rounded value in the 50-90 range after each page, and finally 100.
 * @returns The concatenated text of all pages with surrounding whitespace trimmed.
 * @throws Propagates any error raised while importing pdf.js, reading the
 *   file, or loading/parsing the document.
 */
export async function extractTextFromPDF(file: File, onProgress?: ProgressCallback): Promise<string> {
  // Read the `str` field of a text-content item without resorting to `any`.
  // Items that carry no string contribute an empty string, which is what
  // `Array.prototype.join` produced for `undefined` entries before.
  const readItemText = (item: unknown): string =>
    typeof item === 'object' && item !== null && 'str' in item && typeof item.str === 'string'
      ? item.str
      : '';

  onProgress?.(10);

  // Dynamically import pdfjs-dist (client-side only)
  const pdfjsLib = await import('pdfjs-dist');

  // Set worker source; the URL embeds the library version so worker and API match.
  pdfjsLib.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.mjs`;
  onProgress?.(20);

  // Read file as ArrayBuffer
  const arrayBuffer = await file.arrayBuffer();
  onProgress?.(30);

  // Load PDF document
  const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });

  try {
    const pdf = await loadingTask.promise;
    onProgress?.(50);

    const numPages = pdf.numPages;
    const pageTexts: string[] = [];

    // Extract text from each page (pdf.js pages are 1-based)
    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();

      pageTexts.push(textContent.items.map(readItemText).join(' '));

      // Drop per-page resources as soon as the text has been read.
      page.cleanup();

      // Pages account for the 50 -> 90 part of the progress range.
      onProgress?.(Math.round(50 + (pageNum / numPages) * 40));
    }

    onProgress?.(100);
    return pageTexts.join('\n\n').trim();
  } finally {
    // Release the document and its worker on success and on failure alike.
    // Best-effort: a cleanup failure must not mask the original error.
    await loadingTask.destroy().catch(() => undefined);
  }
}
/**
 * Convert a PDF file into a plain-text blob.
 *
 * Delegates the extraction to `extractTextFromPDF` and wraps the outcome in a
 * `ConversionResult`. Failures are logged and reported through the result
 * object instead of being thrown.
 *
 * @param file - The PDF file to convert.
 * @param onProgress - Optional progress callback, forwarded unchanged to the extractor.
 * @returns On success `{ success: true, blob, duration }` with a `text/plain`
 *   blob; on failure `{ success: false, error, duration }`. `duration` is the
 *   elapsed time in milliseconds.
 */
export async function pdfToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    const extracted = await extractTextFromPDF(file, onProgress);

    return {
      success: true,
      blob: new Blob([extracted], { type: 'text/plain' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[PDF Converter] PDF to text error:', err);

    // Non-Error throwables get a generic message.
    const message = err instanceof Error ? err.message : 'Failed to extract text from PDF';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
/**
 * Convert a PDF file into a Markdown blob.
 *
 * The text is extracted with `extractTextFromPDF`, split on blank lines, and
 * the non-empty paragraphs are re-joined with blank lines between them.
 * Failures are logged and reported through the result object instead of
 * being thrown.
 *
 * @param file - The PDF file to convert.
 * @param onProgress - Optional progress callback. Extraction progress is
 *   scaled by 0.9; 100 is reported once the Markdown has been assembled.
 * @returns On success `{ success: true, blob, duration }` with a
 *   `text/markdown` blob; on failure `{ success: false, error, duration }`.
 */
export async function pdfToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Use 90% of the progress range for extraction
    const extracted = await extractTextFromPDF(file, (extractionProgress) => {
      onProgress?.(extractionProgress * 0.9);
    });

    // Basic text to markdown conversion: keep only paragraphs with visible content
    const paragraphs = extracted.split('\n\n');
    const nonEmptyParagraphs = paragraphs.filter((paragraph) => paragraph.trim().length > 0);
    const markdown = nonEmptyParagraphs.join('\n\n');

    onProgress?.(100);

    return {
      success: true,
      blob: new Blob([markdown], { type: 'text/markdown' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[PDF Converter] PDF to markdown error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert PDF to Markdown';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
/**
 * Lay out plain text on portrait A4 pages and return the PDF as a blob.
 *
 * The text is set in 12pt Helvetica with a 20mm margin on every side and a
 * 7mm line advance. Lines are wrapped to the printable width by jsPDF, and a
 * new page is started whenever the next line would cross the bottom margin.
 *
 * @param text - The text to render.
 * @param filename - Accepted for callers' convenience; not read by this function.
 * @param onProgress - Optional callback; receives 20, 40 and 60 during setup,
 *   a rounded value in the 60-90 range every tenth line, then 90 and 100.
 * @returns The generated PDF document as a `Blob`.
 */
export async function textToPDF(
  text: string,
  filename: string = 'document.pdf',
  onProgress?: ProgressCallback
): Promise<Blob> {
  onProgress?.(20);

  const pdf = new jsPDF({
    orientation: 'portrait',
    unit: 'mm',
    format: 'a4',
  });
  onProgress?.(40);

  // Set font and size
  pdf.setFont('helvetica');
  pdf.setFontSize(12);

  // Page geometry (millimetres, matching the `unit` chosen above)
  const margin = 20;
  const lineHeight = 7;
  const pageWidth = pdf.internal.pageSize.getWidth();
  const pageHeight = pdf.internal.pageSize.getHeight();
  const printableWidth = pageWidth - 2 * margin;
  const bottomLimit = pageHeight - margin;
  onProgress?.(60);

  // Let jsPDF wrap the text to the printable width
  const wrappedLines: string[] = pdf.splitTextToSize(text, printableWidth);
  const lineCount = wrappedLines.length;

  let cursorY = margin;
  wrappedLines.forEach((line, index) => {
    // Start a new page when this line would not fit above the bottom margin
    if (cursorY + lineHeight > bottomLimit) {
      pdf.addPage();
      cursorY = margin;
    }

    pdf.text(line, margin, cursorY);
    cursorY += lineHeight;

    // Report progress on every tenth line only
    if (onProgress && index % 10 === 0) {
      onProgress(Math.round(60 + (index / lineCount) * 30));
    }
  });
  onProgress?.(90);

  // Generate PDF blob
  const result = pdf.output('blob');
  onProgress?.(100);

  return result;
}
/**
 * Convert a Markdown file into a PDF.
 *
 * The Markdown is rendered to HTML with `marked`, the tags are stripped, the
 * character references are decoded, and the remaining plain text is laid out
 * by `textToPDF`. Failures are logged and reported through the result object
 * instead of being thrown.
 *
 * @param file - The Markdown file to convert.
 * @param onProgress - Optional callback; receives 10, 20, 40 and 60 while the
 *   text is prepared, then `60 + p * 0.4` for each value `p` reported by
 *   `textToPDF`.
 * @returns On success `{ success: true, blob, duration }` holding the PDF
 *   blob; on failure `{ success: false, error, duration }`.
 */
export async function markdownToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  // Named character references that are decoded (names are case-sensitive).
  const namedEntities = new Map<string, string>([
    ['nbsp', ' '],
    ['quot', '"'],
    ['apos', "'"],
    ['lt', '<'],
    ['gt', '>'],
    ['amp', '&'],
  ]);

  // Decode named and numeric character references in a single pass, so text
  // produced by one replacement is never decoded a second time. Numeric
  // references matter because marked escapes apostrophes as `&#39;`.
  // Unknown names and invalid code points are left untouched.
  const decodeEntities = (input: string): string =>
    input.replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g, (match: string, body: string) => {
      if (!body.startsWith('#')) {
        return namedEntities.get(body) ?? match;
      }
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
      return isValid ? String.fromCodePoint(codePoint) : match;
    });

  try {
    onProgress?.(10);

    // Read markdown content
    const markdown = await file.text();
    onProgress?.(20);

    // Import marked for markdown parsing
    const { marked } = await import('marked');

    // Parse markdown to HTML
    const html = await marked.parse(markdown);
    onProgress?.(40);

    // Strip HTML tags first, then decode, so decoded `<`/`>` are not mistaken for tags
    const text = decodeEntities(html.replace(/<[^>]*>/g, ''));
    onProgress?.(60);

    // Generate PDF; textToPDF's progress fills the remaining 60 -> 100 range
    const pdfBlob = await textToPDF(text, file.name.replace(/\.md$/, '.pdf'), (progress) => {
      onProgress?.(60 + progress * 0.4);
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] Markdown to PDF error:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert Markdown to PDF',
      duration: Date.now() - startTime,
    };
  }
}
/**
 * Convert an HTML file into a PDF.
 *
 * `<script>` and `<style>` elements, HTML comments and all remaining tags are
 * removed, character references are decoded, whitespace is collapsed to
 * single spaces, and the resulting text is laid out by `textToPDF`. Failures
 * are logged and reported through the result object instead of being thrown.
 *
 * @param file - The HTML file to convert.
 * @param onProgress - Optional callback; receives 10, 30 and 50 while the
 *   text is prepared, then `50 + p * 0.5` for each value `p` reported by
 *   `textToPDF`.
 * @returns On success `{ success: true, blob, duration }` holding the PDF
 *   blob; on failure `{ success: false, error, duration }`.
 */
export async function htmlToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  // Named character references that are decoded (names are case-sensitive).
  const namedEntities = new Map<string, string>([
    ['nbsp', ' '],
    ['quot', '"'],
    ['apos', "'"],
    ['lt', '<'],
    ['gt', '>'],
    ['amp', '&'],
  ]);

  // Decode named and numeric (`&#39;`, `&#x2014;`) character references in a
  // single pass, so text produced by one replacement is never decoded twice.
  // Unknown names and invalid code points are left untouched.
  const decodeEntities = (input: string): string =>
    input.replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g, (match: string, body: string) => {
      if (!body.startsWith('#')) {
        return namedEntities.get(body) ?? match;
      }
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
      return isValid ? String.fromCodePoint(codePoint) : match;
    });

  try {
    onProgress?.(10);

    // Read HTML content
    const html = await file.text();
    onProgress?.(30);

    // Remove markup. Scripts and styles go first so a "<!--" inside them
    // cannot pair with a later "-->"; comments go before generic tags so a
    // ">" inside a comment does not leak the rest of the comment as text.
    const withoutMarkup = html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
      .replace(/<!--[\s\S]*?-->/g, ' ')
      .replace(/<[^>]*>/g, ' ');

    // Decode after stripping so decoded `<`/`>` are not mistaken for tags,
    // then collapse all whitespace runs into single spaces.
    const text = decodeEntities(withoutMarkup)
      .replace(/\s+/g, ' ')
      .trim();
    onProgress?.(50);

    // Generate PDF; textToPDF's progress fills the remaining 50 -> 100 range
    const pdfBlob = await textToPDF(text, file.name.replace(/\.html?$/, '.pdf'), (progress) => {
      onProgress?.(50 + progress * 0.5);
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] HTML to PDF error:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert HTML to PDF',
      duration: Date.now() - startTime,
    };
  }
}
/**
 * Convert a plain-text file into a PDF.
 *
 * Reads the file as text and hands it to `textToPDF` for layout. Failures are
 * logged and reported through the result object instead of being thrown.
 *
 * @param file - The text file to convert.
 * @param onProgress - Optional callback; receives 10 and 30 while the file is
 *   read, then `30 + p * 0.7` for each value `p` reported by `textToPDF`.
 * @returns On success `{ success: true, blob, duration }` holding the PDF
 *   blob; on failure `{ success: false, error, duration }`.
 */
export async function plainTextToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    onProgress?.(10);
    const contents = await file.text();
    onProgress?.(30);

    // Swap a trailing ".txt" for ".pdf" in the name passed along to textToPDF
    const outputName = file.name.replace(/\.txt$/, '.pdf');

    // textToPDF's progress fills the remaining 30 -> 100 range
    const blob = await textToPDF(contents, outputName, (layoutProgress) => {
      onProgress?.(30 + layoutProgress * 0.7);
    });

    return {
      success: true,
      blob,
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[PDF Converter] Text to PDF error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert text to PDF';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}