feat: add document conversion support (Markdown, HTML, Plain Text)

- Add marked for Markdown to HTML conversion with GFM support
- Add turndown for HTML to Markdown conversion
- Add DOMPurify for HTML sanitization (security)
- Support Markdown ↔ HTML ↔ Plain Text conversions
- Add styled HTML output with responsive design
- Use client-side only DOMPurify to fix SSR issues

Supported conversions:
- Markdown → HTML (with code syntax, tables, blockquotes)
- HTML → Markdown (clean formatting preservation)
- Markdown/HTML → Plain Text (strip formatting)
- Plain Text → HTML/Markdown (basic formatting)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-11-17 11:01:08 +01:00
parent 1d9f10fd32
commit 9de639b138
5 changed files with 367 additions and 20 deletions

View File

@@ -1,7 +1,16 @@
import { marked } from 'marked';
import TurndownService from 'turndown';
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
// DOMPurify is bound only in the browser: the variable is assigned solely
// when a global `window` exists, so it remains `undefined` in any environment
// without one (e.g. server-side rendering). Every function in this module that
// calls `DOMPurify.sanitize` therefore only works on the client.
// NOTE(review): typed as `any` because it is pulled in through `require`;
// presumably a typed import was avoided to keep the server bundle from
// evaluating the library - confirm before tightening the type.
let DOMPurify: any;
if (typeof window !== 'undefined') {
DOMPurify = require('dompurify');
}
/**
* Convert document using Pandoc (placeholder - not yet implemented)
* Convert document using Markdown/HTML converters
* Note: This uses lightweight JS libraries instead of Pandoc WASM (which isn't widely available)
*/
export async function convertWithPandoc(
file: File,
@@ -9,21 +18,283 @@ export async function convertWithPandoc(
options: ConversionOptions = {},
onProgress?: ProgressCallback
): Promise<ConversionResult> {
// TODO: Implement Pandoc WASM conversion when available
// For now, return an error
const startTime = Date.now();
if (onProgress) onProgress(0);
try {
if (onProgress) onProgress(10);
return {
success: false,
error: 'Pandoc WASM converter is not yet implemented. Document conversion coming soon!',
};
// Read file content as text
const text = await file.text();
if (onProgress) onProgress(30);
// Detect input format from file extension or content
const inputExt = file.name.split('.').pop()?.toLowerCase();
let result: string;
if (onProgress) onProgress(50);
// Perform conversion based on input and output formats
if (inputExt === 'md' || inputExt === 'markdown') {
// Markdown input
if (outputFormat === 'html') {
result = await markdownToHtml(text);
} else if (outputFormat === 'txt') {
result = markdownToText(text);
} else {
throw new Error(`Conversion from Markdown to ${outputFormat} not supported`);
}
} else if (inputExt === 'html' || inputExt === 'htm') {
// HTML input
if (outputFormat === 'md' || outputFormat === 'markdown') {
result = await htmlToMarkdown(text);
} else if (outputFormat === 'txt') {
result = htmlToText(text);
} else {
throw new Error(`Conversion from HTML to ${outputFormat} not supported`);
}
} else if (inputExt === 'txt') {
// Plain text input
if (outputFormat === 'md' || outputFormat === 'markdown') {
result = textToMarkdown(text);
} else if (outputFormat === 'html') {
result = textToHtml(text);
} else {
throw new Error(`Conversion from TXT to ${outputFormat} not supported`);
}
} else {
throw new Error(`Input format ${inputExt} not supported`);
}
if (onProgress) onProgress(90);
// Create blob from result
const blob = new Blob([result], { type: getMimeType(outputFormat) });
if (onProgress) onProgress(100);
const duration = Date.now() - startTime;
return {
success: true,
blob,
duration,
};
} catch (error) {
console.error('[Document Converter] Conversion error:', error);
return {
success: false,
error: error instanceof Error ? error.message : 'Unknown conversion error',
duration: Date.now() - startTime,
};
}
}
/**
* Convert Markdown to HTML (placeholder)
* Convert Markdown to HTML
*/
export async function markdownToHtml(
/**
 * Render a Markdown string as a complete, styled HTML document.
 *
 * The Markdown is parsed with `marked` (GitHub Flavored Markdown, single
 * newlines rendered as `<br>`), the resulting markup is passed through
 * `DOMPurify.sanitize`, and the sanitized fragment is embedded in a fixed
 * HTML page template with basic typography, code, blockquote and table styles.
 *
 * @param markdown - Markdown source text.
 * @returns The full HTML document as a string.
 */
async function markdownToHtml(markdown: string): Promise<string> {
  // Pass the options per call instead of using marked.setOptions(): the latter
  // mutates the shared `marked` configuration, which would silently change
  // rendering for every other user of `marked` in the application.
  const rendered = await marked.parse(markdown, {
    gfm: true, // GitHub Flavored Markdown
    breaks: true, // Convert \n to <br>
  });

  // Sanitize the generated markup before embedding it in the page template.
  const sanitized = DOMPurify.sanitize(rendered);

  // Wrap in basic HTML document
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
pre {
background: #f4f4f4;
border: 1px solid #ddd;
border-radius: 4px;
padding: 1rem;
overflow-x: auto;
}
code {
background: #f4f4f4;
padding: 0.2rem 0.4rem;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
blockquote {
border-left: 4px solid #ddd;
margin: 1rem 0;
padding-left: 1rem;
color: #666;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1rem 0;
}
th, td {
border: 1px solid #ddd;
padding: 0.5rem;
text-align: left;
}
th {
background: #f4f4f4;
}
</style>
</head>
<body>
${sanitized}
</body>
</html>`;
}
/**
 * Turn an HTML string into Markdown.
 *
 * The input is first run through `DOMPurify.sanitize`; the sanitized markup
 * is then handed to a freshly configured Turndown converter.
 *
 * @param html - HTML source text.
 * @returns The Markdown produced by Turndown.
 */
async function htmlToMarkdown(html: string): Promise<string> {
  // Clean the markup before it reaches the converter.
  const safeHtml = DOMPurify.sanitize(html);

  // ATX (#) headings, fenced (```) code blocks, and "-" as the bullet marker.
  return new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
  }).turndown(safeHtml);
}
/**
 * Convert Markdown to plain text by stripping common formatting syntax.
 *
 * This is a regex-based best-effort pass, not a full Markdown parser. It
 * removes heading markers, bold/italic markers, image and link syntax
 * (keeping the alt/link text), inline-code backticks, blockquote markers and
 * horizontal rules. Fenced code blocks are dropped entirely, including their
 * contents. Runs of three or more newlines are collapsed to a single blank
 * line and the result is trimmed.
 *
 * @param markdown - Markdown source text.
 * @returns The text with Markdown syntax removed.
 */
function markdownToText(markdown: string): string {
  const text = markdown
    // Remove headers
    .replace(/^#{1,6}\s+/gm, '')
    // Remove bold/italic
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Remove images. This must run BEFORE the link rule: an image is a link
    // prefixed with "!", so the link rule would otherwise consume the
    // "[alt](url)" part and leave a stray "!alt" behind.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Remove links
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Remove code blocks
    .replace(/```[\s\S]*?```/g, '')
    // Remove inline code
    .replace(/`([^`]+)`/g, '$1')
    // Remove blockquotes
    .replace(/^>\s+/gm, '')
    // Remove horizontal rules
    .replace(/^-{3,}$/gm, '')
    // Clean up multiple newlines
    .replace(/\n{3,}/g, '\n\n');
  return text.trim();
}
/**
 * Convert HTML to plain text.
 *
 * All tags are stripped by DOMPurify (`ALLOWED_TAGS: []`). The sanitized
 * result is requested as a DOM fragment and read through `textContent`, so
 * character references in the source (`&amp;`, `&lt;`, `&nbsp;`, ...) come out
 * as the characters they stand for. Reading the sanitizer's default string
 * output would leave them HTML-encoded in the "plain text". Whitespace runs
 * are then collapsed to single spaces and the result is trimmed.
 *
 * @param html - HTML source text.
 * @returns The text content of the document on a single line.
 */
function htmlToText(html: string): string {
  // RETURN_DOM_FRAGMENT makes DOMPurify hand back a DocumentFragment rather
  // than serialized (and therefore entity-encoded) markup.
  const fragment = DOMPurify.sanitize(html, {
    ALLOWED_TAGS: [],
    RETURN_DOM_FRAGMENT: true,
  });
  const text: string = fragment.textContent ?? '';

  // Clean up whitespace
  return text.replace(/\s+/g, ' ').trim();
}
/**
 * Convert plain text to Markdown.
 *
 * No markup is added: the text is cut at every double newline, chunks that
 * contain only whitespace are discarded, and the remaining chunks are joined
 * back together with exactly one blank line between them.
 *
 * @param text - Plain text input.
 * @returns The text with empty paragraphs removed.
 */
function textToMarkdown(text: string): string {
  const keptParagraphs: string[] = [];
  for (const chunk of text.split('\n\n')) {
    // Skip chunks that are empty or whitespace-only.
    if (chunk.trim() !== '') {
      keptParagraphs.push(chunk);
    }
  }
  return keptParagraphs.join('\n\n');
}
/**
 * Convert plain text to a complete, minimally styled HTML document.
 *
 * Line endings are normalized to `\n` first so Windows (`\r\n`) and old Mac
 * (`\r`) files are split into paragraphs like Unix files. The HTML-special
 * characters `&`, `<`, `>`, `"` and `'` are escaped. Paragraphs are separated
 * by one or more blank (or whitespace-only) lines; single newlines inside a
 * paragraph become `<br>`. Newlines at the edges of a paragraph are dropped so
 * a trailing newline at end of file does not produce a stray `<br>`.
 *
 * @param text - Plain text input.
 * @returns The full HTML document as a string.
 */
function textToHtml(text: string): string {
  const escaped = text
    // Normalize CRLF / CR so paragraph splitting works and no "\r" leaks out
    .replace(/\r\n?/g, '\n')
    // Escape HTML entities ("&" first so later entities are not re-escaped)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#039;');

  // Convert blank-line-separated chunks to paragraphs
  const paragraphs = escaped
    .split(/\n\s*\n/)
    .map(p => p.replace(/^\n+|\n+$/g, ''))
    .filter(p => p.trim())
    .map(p => ` <p>${p.replace(/\n/g, '<br>')}</p>`)
    .join('\n');

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${paragraphs}
</body>
</html>`;
}
/**
 * Get the MIME type for an output format.
 *
 * Matching is case-insensitive. Any format that is not recognized falls back
 * to `text/plain`. A `switch` is used instead of a plain-object lookup so
 * that names inherited from `Object.prototype` (e.g. `"constructor"`) cannot
 * resolve to a non-string value.
 *
 * @param format - Output format identifier such as `html`, `md` or `txt`.
 * @returns The corresponding MIME type string.
 */
function getMimeType(format: string): string {
  switch (format.toLowerCase()) {
    case 'html':
    case 'htm':
      return 'text/html';
    case 'md':
    case 'markdown':
      return 'text/markdown';
    case 'txt':
      return 'text/plain';
    default:
      // Unknown formats are served as plain text.
      return 'text/plain';
  }
}
/**
* Convert Markdown to HTML (convenience function)
*/
export async function markdownToHtmlFile(
file: File,
onProgress?: ProgressCallback
): Promise<ConversionResult> {
@@ -31,9 +302,9 @@ export async function markdownToHtml(
}
/**
* Convert HTML to Markdown (placeholder)
* Convert HTML to Markdown (convenience function)
*/
export async function htmlToMarkdown(
export async function htmlToMarkdownFile(
file: File,
onProgress?: ProgressCallback
): Promise<ConversionResult> {

View File

@@ -76,16 +76,30 @@ export async function loadImageMagick(): Promise<any> {
}
/**
 * Load the document converter (pure JavaScript libraries, not Pandoc WASM).
 *
 * Dynamically imports `marked` and `turndown` in parallel, stores both
 * modules on the module-level `pandocInstance` as `{ marked, turndown }`, and
 * sets `moduleState.pandoc` to `true`. Later calls return the cached instance
 * without importing again.
 *
 * @returns The cached `{ marked, turndown }` module pair.
 * @throws Error with the message "Failed to load document converter" when
 *   either dynamic import rejects; the underlying error is logged to the
 *   console first.
 */
export async function loadPandoc(): Promise<any> {
  // Fast path: already loaded.
  if (pandocInstance && moduleState.pandoc) {
    return pandocInstance;
  }

  // The former placeholder `throw` that sat here is removed: it ran
  // unconditionally and made the loader below unreachable.
  try {
    // Import the converter libraries
    const [marked, turndown] = await Promise.all([
      import('marked'),
      import('turndown'),
    ]);
    pandocInstance = { marked, turndown };
    moduleState.pandoc = true;
    console.log('Document converter loaded successfully');
    return pandocInstance;
  } catch (error) {
    console.error('Failed to load document converter:', error);
    throw new Error('Failed to load document converter');
  }
}
/**