import { marked } from 'marked';
import TurndownService from 'turndown';
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
import {
  pdfToText,
  pdfToMarkdown,
  markdownToPDF,
  htmlToPDF,
  plainTextToPDF,
} from './pdfService';
import {
  docxToText,
  docxToHTML,
  docxToMarkdown,
  textToDOCX,
  markdownToDOCX,
  htmlToDOCX,
} from './docxService';

/** Minimal surface of DOMPurify used by this module. */
interface HtmlSanitizer {
  sanitize(dirty: string, config?: Record<string, unknown>): string;
}

// Import DOMPurify only on client side (it needs a DOM to operate on).
let DOMPurify: HtmlSanitizer | undefined;
if (typeof window !== 'undefined') {
  // eslint-disable-next-line @typescript-eslint/no-var-requires
  const purifyModule = require('dompurify');
  // Some bundlers expose the CommonJS export under `default`; accept both shapes.
  DOMPurify = purifyModule.default ?? purifyModule;
}

/** Alternative spellings of a format mapped onto the canonical name used for dispatch. */
const FORMAT_ALIASES: ReadonlyMap<string, string> = new Map([
  ['markdown', 'md'],
  ['htm', 'html'],
]);

/** MIME types for the text formats this module can emit as a Blob. */
const MIME_TYPES: ReadonlyMap<string, string> = new Map([
  ['html', 'text/html'],
  ['htm', 'text/html'],
  ['md', 'text/markdown'],
  ['markdown', 'text/markdown'],
  ['txt', 'text/plain'],
]);

/**
 * Return the sanitizer, or throw a descriptive error when it was not loaded
 * (i.e. when this module runs without a `window`).
 *
 * @throws Error when DOMPurify is unavailable.
 */
function requireSanitizer(): HtmlSanitizer {
  if (!DOMPurify) {
    throw new Error('HTML sanitization is only available in a browser environment');
  }
  return DOMPurify;
}

/**
 * Normalise a format name: trims, lowercases, drops a leading dot and maps
 * aliases (`markdown` -> `md`, `htm` -> `html`).
 */
function canonicalFormat(raw: string): string {
  const cleaned = raw.trim().toLowerCase().replace(/^\./, '');
  return FORMAT_ALIASES.get(cleaned) ?? cleaned;
}

/**
 * Derive the canonical input format from a file name's extension.
 * Returns an empty string when the name contains no dot at all.
 */
function detectInputFormat(fileName: string): string {
  const dotIndex = fileName.lastIndexOf('.');
  return dotIndex === -1 ? '' : canonicalFormat(fileName.slice(dotIndex + 1));
}

/**
 * Convert document using Markdown/HTML converters
 * Note: This uses lightweight JS libraries instead of Pandoc WASM (which isn't widely available)
 *
 * PDF/DOCX inputs and PDF/DOCX outputs are delegated to the PDF and DOCX
 * services; Markdown, HTML and plain text are converted in this module.
 *
 * @param file - Input file; its format is detected from the file-name extension.
 * @param outputFormat - Target format (`txt`, `md`/`markdown`, `html`/`htm`, `pdf`, `docx`), case-insensitive.
 * @param options - Currently unused by this function; kept for interface compatibility.
 * @param onProgress - Optional callback receiving progress percentages.
 * @returns A result object. Failures are reported as `{ success: false, error, duration }`
 *          rather than by rejecting the promise.
 */
export async function convertWithPandoc(
  file: File,
  outputFormat: string,
  options: ConversionOptions = {},
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();
  void options; // accepted for API compatibility; no option is consumed here

  try {
    onProgress?.(10);

    const inputFormat = detectInputFormat(file.name);
    const targetFormat = canonicalFormat(outputFormat);
    if (inputFormat === '') {
      throw new Error('Input file has no extension; cannot detect its format');
    }

    onProgress?.(30);
    onProgress?.(50);

    // Binary formats are handled by dedicated services that read the file themselves.
    const delegated = await convertViaDocumentServices(
      file,
      inputFormat,
      targetFormat,
      outputFormat,
      onProgress
    );
    if (delegated !== undefined) {
      return delegated;
    }

    // Only the text-to-text conversions need the file decoded as a string.
    const text = await file.text();
    const result = await convertTextContent(text, inputFormat, targetFormat, outputFormat);

    onProgress?.(90);

    // Create blob from result
    const blob = new Blob([result], { type: getMimeType(targetFormat) });

    onProgress?.(100);

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[Document Converter] Conversion error:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown conversion error',
      duration: Date.now() - startTime,
    };
  }
}

/**
 * Dispatch conversions that involve PDF or DOCX on either side.
 *
 * @param file - Input file handed to the service unchanged.
 * @param inputFormat - Canonical input format.
 * @param targetFormat - Canonical output format.
 * @param requestedFormat - Output format exactly as the caller passed it (used in error messages).
 * @param onProgress - Forwarded to the service.
 * @returns The service's result, or `undefined` when neither side is PDF/DOCX.
 * @throws Error when the PDF/DOCX combination is not supported.
 */
async function convertViaDocumentServices(
  file: File,
  inputFormat: string,
  targetFormat: string,
  requestedFormat: string,
  onProgress?: ProgressCallback
): Promise<ConversionResult | undefined> {
  // Handle PDF conversions
  if (inputFormat === 'pdf') {
    switch (targetFormat) {
      case 'txt':
        return pdfToText(file, onProgress);
      case 'md':
        return pdfToMarkdown(file, onProgress);
      default:
        throw new Error(`Conversion from PDF to ${requestedFormat} not supported`);
    }
  }

  // Handle DOCX conversions
  if (inputFormat === 'docx') {
    switch (targetFormat) {
      case 'txt':
        return docxToText(file, onProgress);
      case 'html':
        return docxToHTML(file, onProgress);
      case 'md':
        return docxToMarkdown(file, onProgress);
      default:
        throw new Error(`Conversion from DOCX to ${requestedFormat} not supported`);
    }
  }

  // Handle conversions TO PDF
  if (targetFormat === 'pdf') {
    switch (inputFormat) {
      case 'md':
        return markdownToPDF(file, onProgress);
      case 'html':
        return htmlToPDF(file, onProgress);
      case 'txt':
        return plainTextToPDF(file, onProgress);
      default:
        throw new Error(`Conversion from ${inputFormat} to PDF not supported`);
    }
  }

  // Handle conversions TO DOCX
  if (targetFormat === 'docx') {
    switch (inputFormat) {
      case 'md':
        return markdownToDOCX(file, onProgress);
      case 'html':
        return htmlToDOCX(file, onProgress);
      case 'txt':
        return textToDOCX(file, onProgress);
      default:
        throw new Error(`Conversion from ${inputFormat} to DOCX not supported`);
    }
  }

  return undefined;
}

/**
 * Convert between the text formats handled in this module (Markdown, HTML, plain text).
 *
 * @param text - Decoded content of the input file.
 * @param inputFormat - Canonical input format.
 * @param targetFormat - Canonical output format.
 * @param requestedFormat - Output format exactly as the caller passed it (used in error messages).
 * @returns The converted document as a string.
 * @throws Error when the input format or the format pair is not supported.
 */
async function convertTextContent(
  text: string,
  inputFormat: string,
  targetFormat: string,
  requestedFormat: string
): Promise<string> {
  switch (inputFormat) {
    case 'md':
      if (targetFormat === 'html') return markdownToHtml(text);
      if (targetFormat === 'txt') return markdownToText(text);
      throw new Error(`Conversion from Markdown to ${requestedFormat} not supported`);

    case 'html':
      if (targetFormat === 'md') return htmlToMarkdown(text);
      if (targetFormat === 'txt') return htmlToText(text);
      throw new Error(`Conversion from HTML to ${requestedFormat} not supported`);

    case 'txt':
      if (targetFormat === 'md') return textToMarkdown(text);
      if (targetFormat === 'html') return textToHtml(text);
      throw new Error(`Conversion from TXT to ${requestedFormat} not supported`);

    default:
      throw new Error(`Input format ${inputFormat} not supported`);
  }
}

/**
 * Wrap an HTML fragment in a minimal standalone HTML document.
 * The fragment is inserted verbatim, so it must already be sanitized/escaped.
 */
function wrapHtmlDocument(bodyHtml: string): string {
  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Converted Document</title>
</head>
<body>
${bodyHtml}
</body>
</html>`;
}

/**
 * Convert Markdown to HTML
 *
 * @returns A complete HTML document whose body is the sanitized rendering of `markdown`.
 * @throws Error when the sanitizer is unavailable (no `window`).
 */
async function markdownToHtml(markdown: string): Promise<string> {
  // Configure marked options
  marked.setOptions({
    gfm: true, // GitHub Flavored Markdown
    breaks: true, // Convert \n to <br>
  });

  const html = await marked.parse(markdown);

  // Sanitize HTML for security
  const sanitized = requireSanitizer().sanitize(html);

  // Wrap in basic HTML document
  return wrapHtmlDocument(sanitized);
}

/**
 * Convert HTML to Markdown
 *
 * The HTML is sanitized before being handed to Turndown.
 *
 * @throws Error when the sanitizer is unavailable (no `window`).
 */
async function htmlToMarkdown(html: string): Promise<string> {
  // Sanitize HTML first
  const sanitized = requireSanitizer().sanitize(html);

  // Configure TurndownService
  const turndownService = new TurndownService({
    headingStyle: 'atx', // Use # for headings
    codeBlockStyle: 'fenced', // Use ``` for code blocks
    bulletListMarker: '-', // Use - for bullet lists
  });

  return turndownService.turndown(sanitized);
}

/**
 * Convert Markdown to plain text (strip formatting)
 *
 * Regex-based and best-effort. Fenced code blocks are removed entirely,
 * images and links are reduced to their alt/link text, and list markers
 * are left untouched.
 */
function markdownToText(markdown: string): string {
  return (
    markdown
      .replace(/\r\n?/g, '\n')
      // Remove code blocks first so their contents are never treated as Markdown
      .replace(/```[\s\S]*?```/g, '')
      // Remove horizontal rules before emphasis, since `***` / `___` look like emphasis markers
      .replace(/^(?:-{3,}|\*{3,}|_{3,})[ \t]*$/gm, '')
      // Remove headers ([ \t] rather than \s so following blank lines are not swallowed)
      .replace(/^#{1,6}[ \t]+/gm, '')
      // Remove blockquote markers, including nested ones
      .replace(/^(?:>[ \t]?)+/gm, '')
      // Remove images BEFORE links: the link pattern also matches the tail of an image
      .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
      // Remove links
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
      // Remove bold; delimiters must hug non-space text so "2 * 3 * 4" survives
      .replace(/\*\*(?=\S)(.*?\S)\*\*/g, '$1')
      .replace(/(^|\W)__(?=\S)(.*?\S)__(?!\w)/g, '$1$2')
      // Remove italic; underscores inside words (snake_case) are not emphasis
      .replace(/\*(?=\S)(.*?\S)\*/g, '$1')
      .replace(/(^|[^\w])_(?=\S)(.*?\S)_(?!\w)/g, '$1$2')
      // Remove inline code
      .replace(/`([^`]+)`/g, '$1')
      // Clean up multiple newlines
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  );
}

/**
 * Decode HTML entities in tag-free markup by parsing it into a detached document.
 * Falls back to returning the input unchanged when `DOMParser` is unavailable.
 */
function decodeHtmlEntities(markup: string): string {
  if (typeof DOMParser === 'undefined') {
    return markup;
  }
  const parsed = new DOMParser().parseFromString(markup, 'text/html');
  return parsed.body.textContent ?? '';
}

/**
 * Convert HTML to plain text
 *
 * All tags are stripped by the sanitizer, entities are decoded, and runs of
 * whitespace collapse to a single space.
 *
 * @throws Error when the sanitizer is unavailable (no `window`).
 */
function htmlToText(html: string): string {
  // Sanitize HTML first; with no allowed tags only text content remains
  const stripped = requireSanitizer().sanitize(html, { ALLOWED_TAGS: [] });

  // The sanitizer returns serialized HTML, so `&` is still `&amp;` at this point
  const decoded = decodeHtmlEntities(stripped);

  // Clean up whitespace
  return decoded.replace(/\s+/g, ' ').trim();
}

/**
 * Convert plain text to Markdown
 *
 * Treats blank-line-separated chunks as paragraphs and drops empty ones.
 * Characters that are special in Markdown are not escaped.
 */
function textToMarkdown(text: string): string {
  return text
    .replace(/\r\n?/g, '\n')
    .split(/\n[ \t]*\n/)
    .filter((paragraph) => paragraph.trim())
    .join('\n\n');
}

/** Escape the five characters that are significant in HTML text and attribute values. */
function escapeHtml(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Convert plain text to HTML
 *
 * Blank-line-separated chunks become `<p>` elements; single newlines inside a
 * chunk become `<br>`.
 *
 * @returns A complete HTML document.
 */
function textToHtml(text: string): string {
  // Escape HTML entities
  const escaped = escapeHtml(text.replace(/\r\n?/g, '\n'));

  // Convert newlines to paragraphs
  const paragraphs = escaped
    .split(/\n[ \t]*\n/)
    .filter((paragraph) => paragraph.trim())
    .map((paragraph) => `<p>${paragraph.replace(/\n/g, '<br>')}</p>`)
    .join('\n');

  return wrapHtmlDocument(paragraphs);
}

/**
 * Get MIME type for output format
 *
 * @param format - Output format name, case-insensitive.
 * @returns The matching MIME type, or `text/plain` for unknown formats.
 */
function getMimeType(format: string): string {
  // Map lookup avoids matching inherited Object.prototype keys such as "constructor"
  return MIME_TYPES.get(format.toLowerCase()) ?? 'text/plain';
}

/**
 * Convert Markdown to HTML (convenience function)
 *
 * Equivalent to `convertWithPandoc(file, 'html', {}, onProgress)`.
 */
export async function markdownToHtmlFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  return convertWithPandoc(file, 'html', {}, onProgress);
}

/**
 * Convert HTML to Markdown (convenience function)
 *
 * Equivalent to `convertWithPandoc(file, 'md', {}, onProgress)`.
 */
export async function htmlToMarkdownFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  return convertWithPandoc(file, 'md', {}, onProgress);
}