feat: add comprehensive PDF support
- Add jsPDF for PDF generation from text/Markdown/HTML
- Add PDF.js for PDF text extraction (read PDFs)
- Support PDF → Text/Markdown conversions
- Support Markdown/HTML/Text → PDF conversions
- Implement page-by-page PDF text extraction
- Automatic pagination and formatting for generated PDFs

Supported PDF operations:
- Extract text from PDF files (all pages)
- Convert PDF to Markdown or plain text
- Create formatted PDFs from Markdown, HTML, or plain text
- Automatic text wrapping and page breaks

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
334
lib/converters/pdfService.ts
Normal file
334
lib/converters/pdfService.ts
Normal file
@@ -0,0 +1,334 @@
|
||||
import { jsPDF } from 'jspdf';
|
||||
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
||||
|
||||
/**
 * Extract the text of every page of a PDF file.
 *
 * pdf.js is loaded with a dynamic import so it is only pulled in when this
 * function actually runs (client-side only). The text items of each page are
 * joined with single spaces and pages are separated by a blank line.
 *
 * @param file - The PDF to read; its bytes are loaded into memory in full.
 * @param onProgress - Optional callback. Receives 10, 20, 30 and 50 during
 *   setup, a rounded value between 50 and 90 after each page, and finally 100.
 * @returns The concatenated text of all pages with surrounding whitespace trimmed.
 * @throws Any error raised while importing pdf.js, reading the file, or
 *   parsing the document is propagated to the caller.
 */
export async function extractTextFromPDF(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // Dynamically import pdfjs-dist (client-side only)
  const pdfjsLib = await import('pdfjs-dist');

  // Use an explicit https: scheme. A protocol-relative URL ("//host/...")
  // inherits the page's scheme and therefore cannot be fetched when the app
  // is served from a non-http(s) origin such as file://.
  pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.mjs`;

  onProgress?.(20);

  // Read file as ArrayBuffer
  const arrayBuffer = await file.arrayBuffer();

  onProgress?.(30);

  // Load PDF document
  const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });

  try {
    const pdf = await loadingTask.promise;

    onProgress?.(50);

    const numPages = pdf.numPages;
    const pageTexts: string[] = [];

    // Pages are 1-indexed in pdf.js
    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();

      // Items without a `str` field contribute an empty string, which is
      // what joining an undefined entry produced before.
      const pageText = textContent.items
        .map((item) => ('str' in item ? item.str : ''))
        .join(' ');

      pageTexts.push(pageText);

      // Page extraction is mapped onto the 50-90 range
      if (onProgress) {
        const progress = 50 + (pageNum / numPages) * 40;
        onProgress(Math.round(progress));
      }
    }

    onProgress?.(100);

    return pageTexts.join('\n\n').trim();
  } finally {
    // Release the document and its worker whether extraction succeeded or
    // failed; otherwise every call leaves them alive.
    await loadingTask.destroy();
  }
}
|
||||
|
||||
/**
 * Convert a PDF file into a plain-text blob.
 *
 * Extraction is delegated to `extractTextFromPDF`, and the caller's progress
 * callback is passed straight through. Failures are logged and reported in
 * the returned result instead of being thrown.
 *
 * @param file - The PDF to convert.
 * @param onProgress - Optional progress callback forwarded to the extractor.
 * @returns On success, `success: true` with a `text/plain` blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds as `duration`.
 */
export async function pdfToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const began = Date.now();

  try {
    const extracted = await extractTextFromPDF(file, onProgress);
    const blob = new Blob([extracted], { type: 'text/plain' });
    const duration = Date.now() - began;

    return { success: true, blob, duration };
  } catch (error) {
    console.error('[PDF Converter] PDF to text error:', error);

    // Only Error instances carry a usable message; anything else gets the generic text.
    let message = 'Failed to extract text from PDF';
    if (error instanceof Error) {
      message = error.message;
    }

    return {
      success: false,
      error: message,
      duration: Date.now() - began,
    };
  }
}
|
||||
|
||||
/**
 * Convert a PDF file into a Markdown blob.
 *
 * The text is obtained from `extractTextFromPDF`. The Markdown step is
 * deliberately minimal: the text is split on blank lines, empty paragraphs
 * are dropped, and the remainder is re-joined with blank lines. No headings,
 * lists or emphasis are inferred.
 *
 * @param file - The PDF to convert.
 * @param onProgress - Optional callback. Extraction progress is scaled to
 *   90% and rounded to an integer; 100 is reported once the Markdown is built.
 * @returns On success, `success: true` with a `text/markdown` blob; on
 *   failure, `success: false` with an error message. Both carry the elapsed
 *   time in milliseconds as `duration`.
 */
export async function pdfToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    const text = await extractTextFromPDF(file, (progress) => {
      // Use 90% for extraction. Round so callers receive whole numbers,
      // matching the integer progress reported by the extractor itself.
      if (onProgress) onProgress(Math.round(progress * 0.9));
    });

    // Basic text to markdown conversion (paragraphs)
    const markdown = text
      .split('\n\n')
      .filter((paragraph) => paragraph.trim().length > 0)
      .join('\n\n');

    if (onProgress) onProgress(100);

    const blob = new Blob([markdown], { type: 'text/markdown' });

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] PDF to markdown error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert PDF to Markdown',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
|
||||
/**
 * Render plain text into a PDF document and return it as a blob.
 *
 * The document is A4 portrait, measured in millimetres, set in 12pt
 * Helvetica with a 20mm margin and a 7mm line advance. Text is wrapped to
 * the printable width and a new page is started whenever the next line would
 * cross the bottom margin.
 *
 * @param text - The text to lay out.
 * @param filename - Accepted for callers' convenience; this function does
 *   not read it.
 * @param onProgress - Optional callback. Receives 20, 40 and 60 during
 *   setup, a rounded value in the 60-90 range on every tenth line, then 90
 *   and 100.
 * @returns The generated PDF as a blob.
 */
export async function textToPDF(
  text: string,
  filename: string = 'document.pdf',
  onProgress?: ProgressCallback
): Promise<Blob> {
  const report = (value: number): void => {
    if (onProgress) onProgress(value);
  };

  report(20);

  const pdfDoc = new jsPDF({
    orientation: 'portrait',
    unit: 'mm',
    format: 'a4',
  });

  report(40);

  // Typeface and size used for the whole document
  pdfDoc.setFont('helvetica');
  pdfDoc.setFontSize(12);

  // Layout metrics (millimetres)
  const margin = 20;
  const lineHeight = 7;
  const { pageSize } = pdfDoc.internal;
  const printableWidth = pageSize.getWidth() - 2 * margin;
  const bottomLimit = pageSize.getHeight() - margin;
  let cursorY = margin;

  report(60);

  // Wrap the text to the printable width
  const wrappedLines = pdfDoc.splitTextToSize(text, printableWidth);
  const totalLines = wrappedLines.length;

  wrappedLines.forEach((line: string, index: number) => {
    // Start a fresh page when this line would cross the bottom margin
    if (cursorY + lineHeight > bottomLimit) {
      pdfDoc.addPage();
      cursorY = margin;
    }

    pdfDoc.text(line, margin, cursorY);
    cursorY += lineHeight;

    // Report only every tenth line; rendering maps onto the 60-90 range
    if (onProgress && index % 10 === 0) {
      onProgress(Math.round(60 + (index / totalLines) * 30));
    }
  });

  report(90);

  // Serialise the document
  const pdfBlob = pdfDoc.output('blob');

  report(100);

  return pdfBlob;
}
|
||||
|
||||
/**
 * Convert a Markdown file into a PDF.
 *
 * The Markdown is parsed to HTML with `marked` (imported dynamically), the
 * HTML is reduced to plain text by stripping tags and decoding the common
 * character entities, and the text is rendered by `textToPDF`. Markdown
 * formatting itself is therefore not preserved in the PDF.
 *
 * @param file - The Markdown file to convert.
 * @param onProgress - Optional callback. Receives 10, 20, 40 and 60 during
 *   preparation, then the PDF stage's progress mapped onto the remaining
 *   range and rounded to an integer.
 * @returns On success, `success: true` with the PDF blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds as `duration`.
 */
export async function markdownToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    // Read markdown content
    const markdown = await file.text();

    if (onProgress) onProgress(20);

    // Import marked for markdown parsing
    const { marked } = await import('marked');

    // Parse markdown to HTML
    const html = await marked.parse(markdown);

    if (onProgress) onProgress(40);

    // Strip HTML tags, then decode the entities left behind in the text.
    // `&amp;` is decoded last so that an escaped entity such as `&amp;lt;`
    // becomes the literal text `&lt;` rather than being decoded twice.
    const text = html
      .replace(/<[^>]*>/g, '')
      .replace(/&nbsp;/g, ' ')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&amp;/g, '&');

    if (onProgress) onProgress(60);

    // Generate PDF
    const pdfBlob = await textToPDF(text, file.name.replace(/\.md$/, '.pdf'), (progress) => {
      // Map the PDF stage onto the range above 60 and keep the value integral
      if (onProgress) onProgress(Math.round(60 + progress * 0.4));
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] Markdown to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert Markdown to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
|
||||
/**
 * Convert an HTML file into a PDF.
 *
 * The HTML is reduced to plain text: `<script>` and `<style>` elements are
 * removed together with their contents, remaining tags are replaced by
 * spaces, common character entities are decoded, and all runs of whitespace
 * are collapsed to a single space. The resulting single block of text is
 * rendered by `textToPDF`, so the HTML layout is not preserved.
 *
 * @param file - The HTML file to convert.
 * @param onProgress - Optional callback. Receives 10, 30 and 50 during
 *   preparation, then the PDF stage's progress mapped onto the remaining
 *   half and rounded to an integer.
 * @returns On success, `success: true` with the PDF blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds as `duration`.
 */
export async function htmlToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    // Read HTML content
    const html = await file.text();

    if (onProgress) onProgress(30);

    // Strip HTML down to plain text. Script and style elements go first so
    // their contents never reach the output. `&amp;` is decoded last so that
    // an escaped entity such as `&amp;lt;` becomes the literal text `&lt;`
    // rather than being decoded twice.
    const text = html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
      .replace(/<[^>]*>/g, ' ')
      .replace(/&nbsp;/g, ' ')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();

    if (onProgress) onProgress(50);

    // Generate PDF
    const pdfBlob = await textToPDF(text, file.name.replace(/\.html?$/, '.pdf'), (progress) => {
      // Map the PDF stage onto the upper half and keep the value integral
      if (onProgress) onProgress(Math.round(50 + progress * 0.5));
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] HTML to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert HTML to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
|
||||
/**
 * Convert a plain-text file into a PDF.
 *
 * The file's contents are read as text and rendered by `textToPDF` without
 * further processing.
 *
 * @param file - The text file to convert.
 * @param onProgress - Optional callback. Receives 10 and 30 during
 *   preparation, then the PDF stage's progress mapped onto the remaining
 *   range and rounded to an integer.
 * @returns On success, `success: true` with the PDF blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds as `duration`.
 */
export async function plainTextToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const text = await file.text();

    if (onProgress) onProgress(30);

    const pdfBlob = await textToPDF(text, file.name.replace(/\.txt$/, '.pdf'), (progress) => {
      // Map the PDF stage onto the range above 30. Round because the 0.7
      // scale factor otherwise yields fractional values such as 74.1.
      if (onProgress) onProgress(Math.round(30 + progress * 0.7));
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] Text to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert text to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
Reference in New Issue
Block a user