- Add jsPDF for PDF generation from text/Markdown/HTML - Add PDF.js for PDF text extraction (read PDFs) - Support PDF → Text/Markdown conversions - Support Markdown/HTML/Text → PDF conversions - Implement page-by-page PDF text extraction - Automatic pagination and formatting for generated PDFs Supported PDF operations: - Extract text from PDF files (all pages) - Convert PDF to Markdown or plain text - Create formatted PDFs from Markdown, HTML, or plain text - Automatic text wrapping and page breaks 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
335 lines
7.8 KiB
TypeScript
335 lines
7.8 KiB
TypeScript
import { jsPDF } from 'jspdf';
|
|
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
|
|
|
/**
 * Extract the text of every page of a PDF file.
 *
 * pdfjs-dist is imported lazily, the file is read into memory, and the text
 * items of each page are joined with single spaces. Pages are separated by a
 * blank line and the combined result is trimmed.
 *
 * @param file - PDF file to read.
 * @param onProgress - Optional callback; receives 10, 20, 30 and 50 during
 *   setup, a rounded value in the 50-90 range after each page, then 100.
 * @returns The concatenated text of all pages.
 * @throws Propagates any error raised while importing pdfjs-dist, reading the
 *   file, or loading/parsing the document.
 */
export async function extractTextFromPDF(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // Dynamically import pdfjs-dist (client-side only)
  const pdfjsLib = await import('pdfjs-dist');

  // Set worker source
  pdfjsLib.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.mjs`;

  onProgress?.(20);

  // Read file as ArrayBuffer
  const arrayBuffer = await file.arrayBuffer();

  onProgress?.(30);

  // Load PDF document
  const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });

  try {
    const pdf = await loadingTask.promise;

    onProgress?.(50);

    const numPages = pdf.numPages;
    const pageTexts: string[] = [];

    // Extract text from each page
    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();

      // Items without a `str` field contribute an empty string, which is what
      // joining an undefined value produced before.
      const parts: string[] = [];
      for (const item of textContent.items) {
        parts.push('str' in item ? item.str : '');
      }
      pageTexts.push(parts.join(' '));

      // Page extraction accounts for the 50-90 slice of the progress range.
      onProgress?.(Math.round(50 + (pageNum / numPages) * 40));
    }

    onProgress?.(100);

    return pageTexts.join('\n\n').trim();
  } finally {
    // Release the document and its worker resources on success and on failure.
    await loadingTask.destroy();
  }
}
|
|
|
|
/**
 * Convert a PDF file into a plain-text blob.
 *
 * @param file - PDF file to convert.
 * @param onProgress - Optional progress callback, handed straight to
 *   `extractTextFromPDF`.
 * @returns On success, `success: true` with a `text/plain` blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds. Failures are logged to the console and never thrown.
 */
export async function pdfToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();
  const elapsed = (): number => Date.now() - startedAt;

  try {
    const extracted = await extractTextFromPDF(file, onProgress);
    const textBlob = new Blob([extracted], { type: 'text/plain' });

    return { success: true, blob: textBlob, duration: elapsed() };
  } catch (error) {
    console.error('[PDF Converter] PDF to text error:', error);

    const message =
      error instanceof Error ? error.message : 'Failed to extract text from PDF';

    return { success: false, error: message, duration: elapsed() };
  }
}
|
|
|
|
/**
 * Convert a PDF file into a Markdown blob.
 *
 * The extracted text is split on blank lines, empty paragraphs are dropped,
 * and the remaining paragraphs are re-joined with blank lines; no other
 * Markdown structure is inferred.
 *
 * @param file - PDF file to convert.
 * @param onProgress - Optional callback; extraction progress is scaled into
 *   the first 90% and rounded to a whole number, then 100 is reported.
 * @returns On success, `success: true` with a `text/markdown` blob; on
 *   failure, `success: false` with an error message. Both carry the elapsed
 *   time in milliseconds. Failures are logged to the console and never thrown.
 */
export async function pdfToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    const text = await extractTextFromPDF(file, (progress) => {
      // Use 90% for extraction; round so callers never see values like 56.7.
      if (onProgress) onProgress(Math.round(progress * 0.9));
    });

    // Basic text to markdown conversion (paragraphs)
    const markdown = text
      .split('\n\n')
      .filter((paragraph: string) => paragraph.trim() !== '')
      .join('\n\n');

    if (onProgress) onProgress(100);

    const blob = new Blob([markdown], { type: 'text/markdown' });

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] PDF to markdown error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert PDF to Markdown',
      duration: Date.now() - startTime,
    };
  }
}
|
|
|
|
/**
 * Render plain text into a portrait A4 PDF.
 *
 * The text is wrapped to the page width minus a margin of 20 on each side
 * (document units are configured as mm) and written in 12pt Helvetica with a
 * line advance of 7. A new page is started whenever the next line would cross
 * the bottom margin.
 *
 * @param text - Text to render.
 * @param filename - Accepted for interface compatibility; not read by this
 *   function.
 * @param onProgress - Optional callback; receives 20, 40 and 60 during setup,
 *   a rounded value in the 60-90 range on every tenth line, then 90 and 100.
 * @returns The generated PDF as a blob.
 */
export async function textToPDF(
  text: string,
  filename: string = 'document.pdf',
  onProgress?: ProgressCallback
): Promise<Blob> {
  const report = (value: number): void => {
    if (onProgress) onProgress(value);
  };

  report(20);

  const pdf = new jsPDF({ orientation: 'portrait', unit: 'mm', format: 'a4' });

  report(40);

  // Typeface must be configured before wrapping so widths are measured with it.
  pdf.setFont('helvetica');
  pdf.setFontSize(12);

  // Layout metrics
  const margin = 20;
  const lineAdvance = 7;
  const usableWidth = pdf.internal.pageSize.getWidth() - 2 * margin;
  const bottomLimit = pdf.internal.pageSize.getHeight() - margin;
  let cursorY = margin;

  report(60);

  // Wrap the text to the usable width
  const wrappedLines = pdf.splitTextToSize(text, usableWidth);
  const totalLines = wrappedLines.length;

  for (const [index, line] of wrappedLines.entries()) {
    // Start a fresh page when this line would not fit above the bottom margin.
    if (cursorY + lineAdvance > bottomLimit) {
      pdf.addPage();
      cursorY = margin;
    }

    pdf.text(line, margin, cursorY);
    cursorY += lineAdvance;

    // Report every tenth line to keep callback traffic low.
    if (index % 10 === 0) {
      report(Math.round(60 + (index / totalLines) * 30));
    }
  }

  report(90);

  // Serialise the document
  const result = pdf.output('blob');

  report(100);

  return result;
}
|
|
|
|
/**
 * Convert a Markdown file into a PDF.
 *
 * The Markdown is parsed to HTML with `marked`, tags are stripped, the basic
 * HTML entities (nbsp, quot, #39, lt, gt, amp) are decoded back to their
 * characters, and the resulting plain text is rendered with `textToPDF`.
 *
 * @param file - Markdown file to convert.
 * @param onProgress - Optional callback; receives 10, 20, 40 and 60 while
 *   preparing the text, then PDF generation progress scaled into the 60-100
 *   range and rounded to a whole number.
 * @returns On success, `success: true` with the PDF blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds. Failures are logged to the console and never thrown.
 */
export async function markdownToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    // Read markdown content
    const markdown = await file.text();

    if (onProgress) onProgress(20);

    // Import marked for markdown parsing
    const { marked } = await import('marked');

    // Parse markdown to HTML
    const html = await marked.parse(markdown);

    if (onProgress) onProgress(40);

    // Entity name (without the leading ampersand and trailing semicolon)
    // mapped to the character it stands for.
    const entityChars: Record<string, string> = {
      nbsp: ' ',
      quot: '"',
      '#39': "'",
      lt: '<',
      gt: '>',
      amp: '&',
    };

    // Strip tags first, then decode entities in a single pass so an escaped
    // ampersand followed by "lt;" is not decoded twice.
    const text = html
      .replace(/<[^>]*>/g, '')
      .replace(
        /&(nbsp|quot|#39|lt|gt|amp);/g,
        (entity: string, name: string) => entityChars[name] ?? entity
      )
      // Literal non-breaking spaces are normalised to plain spaces as well.
      .replace(/\u00a0/g, ' ');

    if (onProgress) onProgress(60);

    // Generate PDF
    const pdfBlob = await textToPDF(text, file.name.replace(/\.md$/, '.pdf'), (progress) => {
      // Map generation progress into 60-100; round to avoid fractional values.
      if (onProgress) onProgress(Math.round(60 + progress * 0.4));
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] Markdown to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert Markdown to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
|
|
|
/**
 * Convert an HTML file into a PDF.
 *
 * Script and style elements are removed, remaining tags are replaced with
 * spaces, the basic HTML entities (nbsp, quot, #39, lt, gt, amp) are decoded
 * back to their characters, whitespace runs are collapsed to single spaces,
 * and the resulting plain text is rendered with `textToPDF`.
 *
 * @param file - HTML file to convert.
 * @param onProgress - Optional callback; receives 10, 30 and 50 while
 *   preparing the text, then PDF generation progress scaled into the 50-100
 *   range and rounded to a whole number.
 * @returns On success, `success: true` with the PDF blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds. Failures are logged to the console and never thrown.
 */
export async function htmlToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    // Read HTML content
    const html = await file.text();

    if (onProgress) onProgress(30);

    // Entity name (without the leading ampersand and trailing semicolon)
    // mapped to the character it stands for.
    const entityChars: Record<string, string> = {
      nbsp: ' ',
      quot: '"',
      '#39': "'",
      lt: '<',
      gt: '>',
      amp: '&',
    };

    // Strip markup first, then decode entities in a single pass so an escaped
    // ampersand followed by "lt;" is not decoded twice, and entity-encoded
    // angle brackets survive as literal text instead of being stripped as tags.
    const text = html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
      .replace(/<[^>]*>/g, ' ')
      .replace(
        /&(nbsp|quot|#39|lt|gt|amp);/g,
        (entity: string, name: string) => entityChars[name] ?? entity
      )
      .replace(/\s+/g, ' ')
      .trim();

    if (onProgress) onProgress(50);

    // Generate PDF
    const pdfBlob = await textToPDF(text, file.name.replace(/\.html?$/, '.pdf'), (progress) => {
      // Map generation progress into 50-100; round to avoid fractional values.
      if (onProgress) onProgress(Math.round(50 + progress * 0.5));
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] HTML to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert HTML to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
|
|
|
/**
 * Convert a plain-text file into a PDF.
 *
 * The file content is read as text and rendered with `textToPDF`.
 *
 * @param file - Text file to convert.
 * @param onProgress - Optional callback; receives 10 and 30 while reading the
 *   file, then PDF generation progress scaled into the 30-100 range and
 *   rounded to a whole number.
 * @returns On success, `success: true` with the PDF blob; on failure,
 *   `success: false` with an error message. Both carry the elapsed time in
 *   milliseconds. Failures are logged to the console and never thrown.
 */
export async function plainTextToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const text = await file.text();

    if (onProgress) onProgress(30);

    const pdfBlob = await textToPDF(text, file.name.replace(/\.txt$/, '.pdf'), (progress) => {
      // Map generation progress into 30-100; round because the 0.7 scaling
      // otherwise yields floating-point artefacts.
      if (onProgress) onProgress(Math.round(30 + progress * 0.7));
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] Text to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert text to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|