- Add jsPDF for PDF generation from text/Markdown/HTML - Add PDF.js for PDF text extraction (read PDFs) - Support PDF → Text/Markdown conversions - Support Markdown/HTML/Text → PDF conversions - Implement page-by-page PDF text extraction - Automatic pagination and formatting for generated PDFs Supported PDF operations: - Extract text from PDF files (all pages) - Convert PDF to Markdown or plain text - Create formatted PDFs from Markdown, HTML, or plain text - Automatic text wrapping and page breaks 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
345 lines
8.6 KiB
TypeScript
345 lines
8.6 KiB
TypeScript
import { marked } from 'marked';
|
|
import TurndownService from 'turndown';
|
|
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
|
import {
|
|
pdfToText,
|
|
pdfToMarkdown,
|
|
markdownToPDF,
|
|
htmlToPDF,
|
|
plainTextToPDF,
|
|
} from './pdfService';
|
|
|
|
// DOMPurify is pulled in with require() only when a global `window` exists.
// Without one the binding stays undefined, so callers must not assume it is set.
let DOMPurify: any = typeof window === 'undefined' ? undefined : require('dompurify');
|
|
|
|
/**
 * Convert a document between PDF, Markdown, HTML and plain text.
 *
 * Despite the name, no Pandoc binary is involved: Markdown/HTML/text
 * conversions use the lightweight helpers in this module and PDF work is
 * delegated to the functions imported from `./pdfService`.
 *
 * The input format is taken from the extension of `file.name`; the output
 * format is matched case-insensitively.
 *
 * @param file - Source document. Its name's extension selects the input format.
 * @param outputFormat - Target format: `pdf`, `html`, `md`/`markdown` or `txt`.
 * @param options - Accepted for interface compatibility; not read by this function.
 * @param onProgress - Optional callback invoked with progress percentages.
 * @returns The result of the delegated PDF helper, or `{ success: true, blob, duration }`
 *   for text conversions. Failures are never thrown: they are logged and
 *   reported as `{ success: false, error, duration }`.
 */
export async function convertWithPandoc(
  file: File,
  outputFormat: string,
  options: ConversionOptions = {},
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    onProgress?.(10);

    const inputExt = file.name.split('.').pop()?.toLowerCase();
    // getMimeType already lowercases the format; match the same way here so
    // "HTML" or "PDF" are not rejected as unsupported.
    const target = outputFormat.toLowerCase();

    // PDF routes hand the File itself to pdfService, so decoding the file as
    // text would be wasted work (and meaningless for a binary PDF).
    const involvesPdf = inputExt === 'pdf' || target === 'pdf';
    const text = involvesPdf ? '' : await file.text();

    onProgress?.(30);
    onProgress?.(50);

    // PDF as input
    if (inputExt === 'pdf') {
      if (target === 'txt') {
        return await pdfToText(file, onProgress);
      }
      if (target === 'md' || target === 'markdown') {
        return await pdfToMarkdown(file, onProgress);
      }
      throw new Error(`Conversion from PDF to ${outputFormat} not supported`);
    }

    // PDF as output
    if (target === 'pdf') {
      if (inputExt === 'md' || inputExt === 'markdown') {
        return await markdownToPDF(file, onProgress);
      }
      if (inputExt === 'html' || inputExt === 'htm') {
        return await htmlToPDF(file, onProgress);
      }
      if (inputExt === 'txt') {
        return await plainTextToPDF(file, onProgress);
      }
      throw new Error(`Conversion from ${inputExt} to PDF not supported`);
    }

    // Text-to-text conversions
    let result: string;
    switch (inputExt) {
      case 'md':
      case 'markdown':
        if (target === 'html') {
          result = await markdownToHtml(text);
        } else if (target === 'txt') {
          result = markdownToText(text);
        } else {
          throw new Error(`Conversion from Markdown to ${outputFormat} not supported`);
        }
        break;

      case 'html':
      case 'htm':
        if (target === 'md' || target === 'markdown') {
          result = await htmlToMarkdown(text);
        } else if (target === 'txt') {
          result = htmlToText(text);
        } else {
          throw new Error(`Conversion from HTML to ${outputFormat} not supported`);
        }
        break;

      case 'txt':
        if (target === 'md' || target === 'markdown') {
          result = textToMarkdown(text);
        } else if (target === 'html') {
          result = textToHtml(text);
        } else {
          throw new Error(`Conversion from TXT to ${outputFormat} not supported`);
        }
        break;

      default:
        throw new Error(`Input format ${inputExt} not supported`);
    }

    onProgress?.(90);

    const blob = new Blob([result], { type: getMimeType(target) });

    onProgress?.(100);

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[Document Converter] Conversion error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown conversion error',
      duration: Date.now() - startTime,
    };
  }
}
|
|
|
|
/**
 * Render Markdown as a complete, styled HTML document.
 *
 * The Markdown is parsed with GitHub Flavored Markdown enabled and single
 * newlines turned into `<br>`, the resulting HTML is sanitized with DOMPurify,
 * and the sanitized fragment is placed inside a fixed HTML page template.
 *
 * @param markdown - Markdown source text.
 * @returns The full HTML document as a string.
 * @throws Error when DOMPurify has not been loaded (no `window` available).
 */
async function markdownToHtml(markdown: string): Promise<string> {
  // DOMPurify is only loaded when `window` exists; fail with a clear message
  // instead of "Cannot read properties of undefined".
  if (!DOMPurify) {
    throw new Error('HTML sanitization is unavailable: DOMPurify requires a browser environment');
  }

  // Options are passed per call so the shared `marked` instance is not
  // reconfigured globally for every other user of the library.
  const html = await marked.parse(markdown, {
    gfm: true, // GitHub Flavored Markdown
    breaks: true, // Convert \n to <br>
  });

  // Sanitize HTML for security
  const sanitized = DOMPurify.sanitize(html);

  // Wrap in basic HTML document
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
pre {
background: #f4f4f4;
border: 1px solid #ddd;
border-radius: 4px;
padding: 1rem;
overflow-x: auto;
}
code {
background: #f4f4f4;
padding: 0.2rem 0.4rem;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
blockquote {
border-left: 4px solid #ddd;
margin: 1rem 0;
padding-left: 1rem;
color: #666;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1rem 0;
}
th, td {
border: 1px solid #ddd;
padding: 0.5rem;
text-align: left;
}
th {
background: #f4f4f4;
}
</style>
</head>
<body>
${sanitized}
</body>
</html>`;
}
|
|
|
|
/**
 * Convert HTML to Markdown.
 *
 * The HTML is sanitized with DOMPurify first and then converted by Turndown
 * using ATX (`#`) headings, fenced code blocks and `-` bullet markers.
 *
 * @param html - HTML source text.
 * @returns The Markdown produced by Turndown.
 * @throws Error when DOMPurify has not been loaded (no `window` available).
 */
async function htmlToMarkdown(html: string): Promise<string> {
  // DOMPurify is only loaded when `window` exists; fail with a clear message
  // instead of an opaque TypeError on `undefined.sanitize`.
  if (!DOMPurify) {
    throw new Error('HTML sanitization is unavailable: DOMPurify requires a browser environment');
  }

  const sanitized = DOMPurify.sanitize(html);

  const turndownService = new TurndownService({
    headingStyle: 'atx', // Use # for headings
    codeBlockStyle: 'fenced', // Use ``` for code blocks
    bulletListMarker: '-', // Use - for bullet lists
  });

  return turndownService.turndown(sanitized);
}
|
|
|
|
/**
 * Strip common Markdown syntax and return plain text.
 *
 * Handles ATX headings, bold/italic markers, images, links, fenced code
 * blocks (removed together with their content), inline code, blockquote
 * markers and `---` horizontal rules, then collapses runs of three or more
 * newlines to a single blank line and trims the result.
 *
 * @param markdown - Markdown source text.
 * @returns The text with the formatting listed above removed.
 */
function markdownToText(markdown: string): string {
  const text = markdown
    // Remove headers
    .replace(/^#{1,6}\s+/gm, '')
    // Remove bold/italic
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Replace images with their alt text. This must run BEFORE the link rule:
    // the link pattern also matches the "[alt](url)" tail of an image and
    // would leave the leading "!" behind.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Replace links with their link text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Remove code blocks
    .replace(/```[\s\S]*?```/g, '')
    // Remove inline code markers
    .replace(/`([^`]+)`/g, '$1')
    // Remove blockquote markers
    .replace(/^>\s+/gm, '')
    // Remove horizontal rules
    .replace(/^-{3,}$/gm, '')
    // Clean up multiple newlines
    .replace(/\n{3,}/g, '\n\n');

  return text.trim();
}
|
|
|
|
/**
 * Convert HTML to plain text.
 *
 * DOMPurify is run with an empty tag allow-list so that no markup survives,
 * and the text is read from the resulting DOM node rather than from a
 * serialized HTML string, so character references such as `&amp;` come back
 * as the characters they stand for. Whitespace runs are collapsed to single
 * spaces and the result is trimmed.
 *
 * @param html - HTML source text.
 * @returns The text content on a single line.
 * @throws Error when DOMPurify has not been loaded (no `window` available).
 */
function htmlToText(html: string): string {
  // DOMPurify is only loaded when `window` exists; fail with a clear message
  // instead of an opaque TypeError on `undefined.sanitize`.
  if (!DOMPurify) {
    throw new Error('HTML sanitization is unavailable: DOMPurify requires a browser environment');
  }

  // RETURN_DOM yields the sanitized body element; its textContent is the
  // decoded text, whereas the default string output keeps entities encoded.
  const sanitizedBody = DOMPurify.sanitize(html, { ALLOWED_TAGS: [], RETURN_DOM: true });
  const content: string = sanitizedBody.textContent ?? '';

  // Clean up whitespace
  return content.replace(/\s+/g, ' ').trim();
}
|
|
|
|
/**
 * Convert plain text to Markdown.
 *
 * No Markdown syntax is added: the text is split into paragraphs at runs of
 * blank lines, empty or whitespace-only paragraphs are dropped, and the
 * remaining paragraphs are rejoined with exactly one blank line between them.
 *
 * @param text - Plain text input (LF or CRLF line endings).
 * @returns The paragraphs separated by a single blank line.
 */
function textToMarkdown(text: string): string {
  // Split on two OR MORE consecutive line breaks, accepting CRLF as well.
  // Splitting on the literal '\n\n' left a stray newline behind for odd-length
  // runs ("a\n\n\nb") and never matched Windows line endings.
  const paragraphBreak = /(?:\r?\n){2,}/;

  return text
    .split(paragraphBreak)
    .filter(paragraph => paragraph.trim())
    .join('\n\n');
}
|
|
|
|
/**
 * Convert plain text to a complete HTML document.
 *
 * HTML-significant characters are escaped, text separated by blank lines
 * becomes `<p>` elements, single newlines inside a paragraph become `<br>`,
 * and the paragraphs are placed inside a fixed HTML page template.
 *
 * @param text - Plain text input.
 * @returns The full HTML document as a string.
 */
function textToHtml(text: string): string {
  // Escape every character that is significant in HTML text or attribute
  // context so the input can never be interpreted as markup.
  const entityFor = new Map<string, string>([
    ['&', '&amp;'],
    ['<', '&lt;'],
    ['>', '&gt;'],
    ['"', '&quot;'],
    ["'", '&#39;'],
  ]);
  const escaped = text.replace(/[&<>"']/g, ch => entityFor.get(ch) ?? ch);

  // Convert newlines to paragraphs
  const paragraphs = escaped
    .split('\n\n')
    .filter(p => p.trim())
    .map(p => ` <p>${p.replace(/\n/g, '<br>')}</p>`)
    .join('\n');

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${paragraphs}
</body>
</html>`;
}
|
|
|
|
/**
 * Get the MIME type for an output format.
 *
 * The lookup is case-insensitive. Any format without an entry in the table
 * falls back to `text/plain`.
 *
 * @param format - Format name such as `html`, `md` or `txt`.
 * @returns The matching MIME type, or `text/plain` when the format is unknown.
 */
function getMimeType(format: string): string {
  // A Map is used instead of an object literal so that names inherited from
  // Object.prototype ("constructor", "__proto__", ...) are treated as unknown
  // formats rather than returning a non-string value.
  const mimeTypes = new Map<string, string>([
    ['html', 'text/html'],
    ['htm', 'text/html'],
    ['md', 'text/markdown'],
    ['markdown', 'text/markdown'],
    ['txt', 'text/plain'],
  ]);

  return mimeTypes.get(format.toLowerCase()) ?? 'text/plain';
}
|
|
|
|
/**
 * Convenience wrapper: run `convertWithPandoc` on the given file with `html`
 * as the output format and default options.
 *
 * @param file - Source file handed to `convertWithPandoc`.
 * @param onProgress - Optional progress callback, forwarded unchanged.
 * @returns Whatever `convertWithPandoc` resolves to.
 */
export async function markdownToHtmlFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const targetFormat = 'html';
  return convertWithPandoc(file, targetFormat, {}, onProgress);
}
|
|
|
|
/**
 * Convenience wrapper: run `convertWithPandoc` on the given file with `md`
 * as the output format and default options.
 *
 * @param file - Source file handed to `convertWithPandoc`.
 * @param onProgress - Optional progress callback, forwarded unchanged.
 * @returns Whatever `convertWithPandoc` resolves to.
 */
export async function htmlToMarkdownFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const targetFormat = 'md';
  return convertWithPandoc(file, targetFormat, {}, onProgress);
}
|