Files
convert-ui/lib/converters/pandocService.ts
Sebastian Krüger b899989b3e feat: add comprehensive PDF support
- Add jsPDF for PDF generation from text/Markdown/HTML
- Add PDF.js for PDF text extraction (read PDFs)
- Support PDF → Text/Markdown conversions
- Support Markdown/HTML/Text → PDF conversions
- Implement page-by-page PDF text extraction
- Automatic pagination and formatting for generated PDFs

Supported PDF operations:
- Extract text from PDF files (all pages)
- Convert PDF to Markdown or plain text
- Create formatted PDFs from Markdown, HTML, or plain text
- Automatic text wrapping and page breaks

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 11:13:09 +01:00

345 lines
8.6 KiB
TypeScript

import { marked } from 'marked';
import TurndownService from 'turndown';
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
import {
pdfToText,
pdfToMarkdown,
markdownToPDF,
htmlToPDF,
plainTextToPDF,
} from './pdfService';
// DOMPurify is loaded only when a browser `window` global exists; in any other
// environment the binding is left undefined.
let DOMPurify: any = typeof window !== 'undefined' ? require('dompurify') : undefined;
/**
 * Convert a document between Markdown, HTML, plain text and PDF.
 *
 * Note: despite the name, this uses lightweight JS libraries instead of
 * Pandoc WASM (which isn't widely available).
 *
 * The input format is taken from the file name's extension. The output format
 * is matched case-insensitively. Supported routes:
 * - pdf -> txt, md/markdown (delegated to the PDF service)
 * - md/markdown, html/htm, txt -> pdf (delegated to the PDF service)
 * - md/markdown -> html, txt
 * - html/htm -> md/markdown, txt
 * - txt -> md/markdown, html
 *
 * @param file - Source document; its extension selects the input format.
 * @param outputFormat - Target format identifier (e.g. `html`, `md`, `txt`, `pdf`).
 * @param options - Accepted for interface compatibility; not read by this function.
 * @param onProgress - Optional callback invoked with progress percentages.
 * @returns For text routes, `{ success: true, blob, duration }`; for PDF routes,
 *   whatever the PDF service returns. Any error raised along the way is caught
 *   and reported as `{ success: false, error, duration }` instead of being thrown.
 */
export async function convertWithPandoc(
  file: File,
  outputFormat: string,
  options: ConversionOptions = {},
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    onProgress?.(10);

    // Detect input format from the file extension
    const inputExt = file.name.split('.').pop()?.toLowerCase();

    // getMimeType already treats the format case-insensitively; do the same
    // for routing so that e.g. 'HTML' is not rejected as unsupported.
    const targetFormat = outputFormat.toLowerCase();

    // PDF routes hand the File itself to the PDF service, so decoding the
    // (possibly binary) content as text would be wasted work.
    const needsText = inputExt !== 'pdf' && targetFormat !== 'pdf';
    const text = needsText ? await file.text() : '';

    onProgress?.(30);
    onProgress?.(50);

    // Handle conversions FROM PDF
    if (inputExt === 'pdf') {
      if (targetFormat === 'txt') {
        return await pdfToText(file, onProgress);
      }
      if (targetFormat === 'md' || targetFormat === 'markdown') {
        return await pdfToMarkdown(file, onProgress);
      }
      throw new Error(`Conversion from PDF to ${outputFormat} not supported`);
    }

    // Handle conversions TO PDF
    if (targetFormat === 'pdf') {
      if (inputExt === 'md' || inputExt === 'markdown') {
        return await markdownToPDF(file, onProgress);
      }
      if (inputExt === 'html' || inputExt === 'htm') {
        return await htmlToPDF(file, onProgress);
      }
      if (inputExt === 'txt') {
        return await plainTextToPDF(file, onProgress);
      }
      throw new Error(`Conversion from ${inputExt} to PDF not supported`);
    }

    // Text-to-text conversions, chosen by input and output format
    let result: string;
    if (inputExt === 'md' || inputExt === 'markdown') {
      // Markdown input
      if (targetFormat === 'html') {
        result = await markdownToHtml(text);
      } else if (targetFormat === 'txt') {
        result = markdownToText(text);
      } else {
        throw new Error(`Conversion from Markdown to ${outputFormat} not supported`);
      }
    } else if (inputExt === 'html' || inputExt === 'htm') {
      // HTML input
      if (targetFormat === 'md' || targetFormat === 'markdown') {
        result = await htmlToMarkdown(text);
      } else if (targetFormat === 'txt') {
        result = htmlToText(text);
      } else {
        throw new Error(`Conversion from HTML to ${outputFormat} not supported`);
      }
    } else if (inputExt === 'txt') {
      // Plain text input
      if (targetFormat === 'md' || targetFormat === 'markdown') {
        result = textToMarkdown(text);
      } else if (targetFormat === 'html') {
        result = textToHtml(text);
      } else {
        throw new Error(`Conversion from TXT to ${outputFormat} not supported`);
      }
    } else {
      throw new Error(`Input format ${inputExt} not supported`);
    }

    onProgress?.(90);

    // Create blob from result
    const blob = new Blob([result], { type: getMimeType(targetFormat) });

    onProgress?.(100);

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[Document Converter] Conversion error:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown conversion error',
      duration: Date.now() - startTime,
    };
  }
}
/**
 * Render Markdown as a complete, styled HTML document.
 *
 * The Markdown is parsed with `marked` (GFM on, `breaks` on), the resulting
 * markup is passed through `DOMPurify.sanitize`, and the sanitized fragment is
 * placed inside the `<body>` of a fixed page template.
 */
async function markdownToHtml(markdown: string): Promise<string> {
  // gfm: GitHub Flavored Markdown; breaks: convert \n to <br>
  marked.setOptions({ gfm: true, breaks: true });

  const rendered = await marked.parse(markdown);

  // Sanitize before embedding in the page
  const bodyMarkup = DOMPurify.sanitize(rendered);

  // Template contents are kept flush-left on purpose: every character inside
  // the backticks ends up in the generated document.
  const pageCss = `body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
pre {
background: #f4f4f4;
border: 1px solid #ddd;
border-radius: 4px;
padding: 1rem;
overflow-x: auto;
}
code {
background: #f4f4f4;
padding: 0.2rem 0.4rem;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
blockquote {
border-left: 4px solid #ddd;
margin: 1rem 0;
padding-left: 1rem;
color: #666;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1rem 0;
}
th, td {
border: 1px solid #ddd;
padding: 0.5rem;
text-align: left;
}
th {
background: #f4f4f4;
}`;

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
${pageCss}
</style>
</head>
<body>
${bodyMarkup}
</body>
</html>`;
}
/**
 * Turn an HTML string into Markdown.
 *
 * The HTML is sanitized with DOMPurify first, then converted by a
 * TurndownService configured for ATX (`#`) headings, fenced code blocks and
 * `-` bullet markers.
 */
async function htmlToMarkdown(html: string): Promise<string> {
  const cleanHtml = DOMPurify.sanitize(html);

  return new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
  }).turndown(cleanHtml);
}
/**
 * Convert Markdown to plain text by stripping common formatting syntax.
 *
 * This is a regex-based best-effort pass, not a Markdown parser. It removes
 * ATX header markers, bold/italic markers, link and image syntax (keeping the
 * label / alt text), fenced code blocks (dropped entirely, content included),
 * inline-code backticks, blockquote markers and `---` rules, then collapses
 * runs of three or more newlines to a single blank line.
 *
 * @param markdown - Markdown source text.
 * @returns The stripped text with leading and trailing whitespace removed.
 */
function markdownToText(markdown: string): string {
  return markdown
    // Remove header markers
    .replace(/^#{1,6}\s+/gm, '')
    // Remove bold markers. Underscore delimiters only count when they are not
    // glued to a word character, so identifiers like my_func_name survive.
    .replace(/\*\*(.*?)\*\*/g, '$1')
    .replace(/(?<!\w)__(.*?)__(?!\w)/g, '$1')
    // Remove italic markers (same intraword rule for underscores)
    .replace(/\*(.*?)\*/g, '$1')
    .replace(/(?<!\w)_(.*?)_(?!\w)/g, '$1')
    // Replace images with their alt text. This must run before the link rule:
    // the link pattern also matches the "[alt](url)" tail of an image and
    // would leave the leading "!" behind.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Replace links with their label
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Remove fenced code blocks
    .replace(/```[\s\S]*?```/g, '')
    // Unwrap inline code
    .replace(/`([^`]+)`/g, '$1')
    // Remove blockquote markers
    .replace(/^>\s+/gm, '')
    // Remove horizontal rules
    .replace(/^-{3,}$/gm, '')
    // Clean up multiple newlines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Convert HTML to plain text.
 *
 * All tags are stripped via DOMPurify (`ALLOWED_TAGS: []`) and whitespace runs
 * are collapsed to single spaces.
 *
 * @param html - HTML markup to flatten.
 * @returns The text content on a single line, trimmed.
 */
function htmlToText(html: string): string {
  // Ask DOMPurify for the sanitized DOM node rather than a serialized string:
  // serialized output keeps character references such as "&amp;" encoded,
  // which would leak into the plain-text result. Reading textContent from the
  // returned node yields the decoded characters.
  const sanitizedBody = DOMPurify.sanitize(html, { ALLOWED_TAGS: [], RETURN_DOM: true });
  const rawText: string = sanitizedBody.textContent ?? '';

  // Clean up whitespace
  return rawText.replace(/\s+/g, ' ').trim();
}
/**
 * Convert plain text to Markdown.
 *
 * The text is split on blank-line separators ("\n\n"), chunks that contain
 * only whitespace are discarded, and the remaining chunks are rejoined with a
 * single blank line between them. Chunk contents are passed through as-is.
 */
function textToMarkdown(text: string): string {
  const keptChunks: string[] = [];

  for (const chunk of text.split('\n\n')) {
    if (chunk.trim().length > 0) {
      keptChunks.push(chunk);
    }
  }

  return keptChunks.join('\n\n');
}
/**
 * Wrap plain text in a minimal styled HTML document.
 *
 * The five HTML-significant characters (& < > " ') are replaced with entities,
 * the text is split into paragraphs on "\n\n" (whitespace-only chunks are
 * skipped), and single newlines inside a paragraph become `<br>`.
 */
function textToHtml(text: string): string {
  const entityFor: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#039;',
  };
  // One pass over the input; each special character is mapped exactly once.
  const safeText = text.replace(/[&<>"']/g, ch => entityFor[ch] ?? ch);

  const paragraphTags: string[] = [];
  for (const chunk of safeText.split('\n\n')) {
    if (!chunk.trim()) continue;
    paragraphTags.push(' <p>' + chunk.split('\n').join('<br>') + '</p>');
  }
  const bodyMarkup = paragraphTags.join('\n');

  // Template contents are kept flush-left on purpose: every character inside
  // the backticks ends up in the generated document.
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${bodyMarkup}
</body>
</html>`;
}
/**
 * Get the MIME type for a text output format.
 *
 * Matching is case-insensitive. Any format not listed below falls back to
 * `text/plain`.
 *
 * @param format - Output format identifier such as `html`, `md` or `txt`.
 * @returns The MIME type string to use for the output blob.
 */
function getMimeType(format: string): string {
  // A switch (rather than a plain-object lookup) guarantees that names such as
  // "constructor" or "__proto__" cannot resolve to inherited Object.prototype
  // members and leak a non-string value to the caller.
  switch (format.toLowerCase()) {
    case 'html':
    case 'htm':
      return 'text/html';
    case 'md':
    case 'markdown':
      return 'text/markdown';
    case 'txt':
    default:
      return 'text/plain';
  }
}
/**
 * Convenience wrapper: run `convertWithPandoc` on the given file with `html`
 * as the output format and an empty options object.
 */
export async function markdownToHtmlFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const noOptions: ConversionOptions = {};
  const targetFormat = 'html';
  return convertWithPandoc(file, targetFormat, noOptions, onProgress);
}
/**
 * Convenience wrapper: run `convertWithPandoc` on the given file with `md` as
 * the output format and an empty options object.
 */
export async function htmlToMarkdownFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const noOptions: ConversionOptions = {};
  const targetFormat = 'md';
  return convertWithPandoc(file, targetFormat, noOptions, onProgress);
}