feat: add document conversion support (Markdown, HTML, Plain Text)

- Add marked for Markdown to HTML conversion with GFM support
- Add turndown for HTML to Markdown conversion
- Add DOMPurify for HTML sanitization (security)
- Support Markdown ↔ HTML ↔ Plain Text conversions
- Add styled HTML output with responsive design
- Use client-side only DOMPurify to fix SSR issues

Supported conversions:
- Markdown → HTML (with code blocks, tables, blockquotes)
- HTML → Markdown (clean formatting preservation)
- Markdown/HTML → Plain Text (strip formatting)
- Plain Text → HTML/Markdown (basic formatting)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-11-17 11:01:08 +01:00
parent 1d9f10fd32
commit 9de639b138
5 changed files with 367 additions and 20 deletions

View File

@@ -7,7 +7,7 @@ A modern, browser-based file conversion application built with Next.js 16, Tailw
- **🎬 Video Conversion** - Convert between MP4, WebM, AVI, MOV, MKV, and GIF
- **🎵 Audio Conversion** - Convert between MP3, WAV, OGG, AAC, and FLAC
- **🖼️ Image Conversion** - Convert between PNG, JPG, WebP, GIF, BMP, TIFF, and SVG
- **📄 Document Conversion** - (Coming soon) Convert between PDF, DOCX, Markdown, HTML, and TXT
- **📄 Document Conversion** - Convert between Markdown, HTML, and Plain Text
- **🔒 Privacy First** - All conversions happen locally in your browser, no server uploads
- **⚡ Fast & Efficient** - Powered by WebAssembly for near-native performance
- **🎨 Beautiful UI** - Modern, responsive design with dark/light theme support
@@ -23,6 +23,9 @@ A modern, browser-based file conversion application built with Next.js 16, Tailw
- **Tailwind CSS 4** - Utility-first CSS with OKLCH color system
- **FFmpeg.wasm** - Video and audio conversion
- **ImageMagick WASM** - Image processing and conversion
- **Marked** - Markdown to HTML conversion
- **Turndown** - HTML to Markdown conversion
- **DOMPurify** - HTML sanitization
- **Fuse.js** - Fuzzy search for format selection
- **Lucide React** - Beautiful icon library
@@ -111,8 +114,14 @@ convert-ui/
### Images (ImageMagick)
- **Input/Output:** PNG, JPG, WebP, GIF, BMP, TIFF, SVG
### Documents (Coming Soon)
- **Planned:** PDF, DOCX, Markdown, HTML, Plain Text
### Documents
- **Markdown → HTML** - Full GitHub Flavored Markdown support with styling
- **HTML → Markdown** - Clean conversion with formatting preservation
- **Markdown ↔ Plain Text** - Strip Markdown syntax, or carry plain-text paragraphs over as Markdown
- **HTML → Plain Text** - Extract text content
- **Plain Text → HTML** - Convert to formatted HTML document
**Note:** Uses lightweight JavaScript libraries (marked, turndown) instead of Pandoc WASM for fast, reliable conversions.
## How It Works

View File

@@ -1,7 +1,16 @@
import { marked } from 'marked';
import TurndownService from 'turndown';
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
// Import DOMPurify only on client side
// When `window` is absent (e.g. during server-side rendering) the `require`
// below is skipped and DOMPurify stays undefined, so the helpers in this file
// that call DOMPurify.sanitize must only run in the browser.
let DOMPurify: any;
if (typeof window !== 'undefined') {
// `require` (instead of a static import) keeps the load conditional on `window`.
DOMPurify = require('dompurify');
}
/**
* Convert document using Pandoc (placeholder - not yet implemented)
* Convert document using Markdown/HTML converters
* Note: This uses lightweight JS libraries instead of Pandoc WASM (which isn't widely available)
*/
export async function convertWithPandoc(
file: File,
@@ -9,21 +18,283 @@ export async function convertWithPandoc(
options: ConversionOptions = {},
onProgress?: ProgressCallback
): Promise<ConversionResult> {
// TODO: Implement Pandoc WASM conversion when available
// For now, return an error
const startTime = Date.now();
if (onProgress) onProgress(0);
try {
if (onProgress) onProgress(10);
return {
success: false,
error: 'Pandoc WASM converter is not yet implemented. Document conversion coming soon!',
};
// Read file content as text
const text = await file.text();
if (onProgress) onProgress(30);
// Detect input format from the file extension
const inputExt = file.name.split('.').pop()?.toLowerCase();
let result: string;
if (onProgress) onProgress(50);
// Perform conversion based on input and output formats
if (inputExt === 'md' || inputExt === 'markdown') {
// Markdown input
if (outputFormat === 'html') {
result = await markdownToHtml(text);
} else if (outputFormat === 'txt') {
result = markdownToText(text);
} else {
throw new Error(`Conversion from Markdown to ${outputFormat} not supported`);
}
} else if (inputExt === 'html' || inputExt === 'htm') {
// HTML input
if (outputFormat === 'md' || outputFormat === 'markdown') {
result = await htmlToMarkdown(text);
} else if (outputFormat === 'txt') {
result = htmlToText(text);
} else {
throw new Error(`Conversion from HTML to ${outputFormat} not supported`);
}
} else if (inputExt === 'txt') {
// Plain text input
if (outputFormat === 'md' || outputFormat === 'markdown') {
result = textToMarkdown(text);
} else if (outputFormat === 'html') {
result = textToHtml(text);
} else {
throw new Error(`Conversion from TXT to ${outputFormat} not supported`);
}
} else {
throw new Error(`Input format ${inputExt} not supported`);
}
if (onProgress) onProgress(90);
// Create blob from result
const blob = new Blob([result], { type: getMimeType(outputFormat) });
if (onProgress) onProgress(100);
const duration = Date.now() - startTime;
return {
success: true,
blob,
duration,
};
} catch (error) {
console.error('[Document Converter] Conversion error:', error);
return {
success: false,
error: error instanceof Error ? error.message : 'Unknown conversion error',
duration: Date.now() - startTime,
};
}
}
/**
* Convert Markdown to HTML (placeholder)
* Convert Markdown to HTML
*/
export async function markdownToHtml(
/**
 * Convert Markdown to a complete, styled HTML document.
 *
 * The Markdown is rendered with GitHub Flavored Markdown and line-break
 * handling enabled, sanitized with DOMPurify, and embedded in a standalone
 * HTML page with a small built-in stylesheet.
 *
 * @param markdown - Markdown source text.
 * @returns The full HTML document as a string.
 * @throws Error if DOMPurify was not loaded (no `window`, e.g. during SSR).
 */
async function markdownToHtml(markdown: string): Promise<string> {
  // DOMPurify is only loaded when `window` exists; fail with a clear message
  // instead of an opaque "cannot read properties of undefined".
  if (!DOMPurify) {
    throw new Error('HTML sanitization is only available in the browser');
  }

  // Pass the options per call rather than through marked.setOptions():
  // setOptions mutates the shared `marked` instance, which would leak these
  // settings to every other user of the library in the same bundle.
  const html = await marked.parse(markdown, {
    gfm: true, // GitHub Flavored Markdown
    breaks: true, // Convert \n to <br>
  });

  // Sanitize the rendered HTML before embedding it in the page
  const sanitized = DOMPurify.sanitize(html);

  // Wrap in basic HTML document
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
pre {
background: #f4f4f4;
border: 1px solid #ddd;
border-radius: 4px;
padding: 1rem;
overflow-x: auto;
}
code {
background: #f4f4f4;
padding: 0.2rem 0.4rem;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
blockquote {
border-left: 4px solid #ddd;
margin: 1rem 0;
padding-left: 1rem;
color: #666;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1rem 0;
}
th, td {
border: 1px solid #ddd;
padding: 0.5rem;
text-align: left;
}
th {
background: #f4f4f4;
}
</style>
</head>
<body>
${sanitized}
</body>
</html>`;
}
/**
 * Turn an HTML string into Markdown.
 *
 * The markup is passed through DOMPurify first, then handed to Turndown
 * configured for ATX ("#") headings, fenced code blocks and "-" bullets.
 */
async function htmlToMarkdown(html: string): Promise<string> {
  // Clean the incoming markup before conversion
  const safeHtml = DOMPurify.sanitize(html);

  const converter = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
  });

  return converter.turndown(safeHtml);
}
/**
 * Convert Markdown to plain text by stripping common Markdown syntax.
 *
 * This is a regex-based best-effort pass, not a full Markdown parser. It
 * removes heading markers, bold/italic markers, image and link syntax
 * (keeping the alt/link text), fenced code blocks (including their contents),
 * inline-code backticks, blockquote markers and `---` rules, then collapses
 * runs of three or more newlines.
 *
 * @param markdown - Markdown source text.
 * @returns The text with formatting removed, trimmed of surrounding whitespace.
 */
function markdownToText(markdown: string): string {
  const text = markdown
    // Remove headers
    .replace(/^#{1,6}\s+/gm, '')
    // Remove bold/italic
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Remove images. This must run BEFORE the link rule: `![alt](url)` contains
    // `[alt](url)`, so stripping links first would leave a stray "!alt".
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Remove links, keeping the link text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Remove code blocks
    .replace(/```[\s\S]*?```/g, '')
    // Remove inline code
    .replace(/`([^`]+)`/g, '$1')
    // Remove blockquotes
    .replace(/^>\s+/gm, '')
    // Remove horizontal rules
    .replace(/^-{3,}$/gm, '')
    // Clean up multiple newlines
    .replace(/\n{3,}/g, '\n\n');
  return text.trim();
}
/**
 * Convert HTML to plain text.
 *
 * All tags are stripped with DOMPurify and the remaining text is returned with
 * every run of whitespace collapsed to a single space.
 *
 * @param html - HTML source text.
 * @returns The text content, whitespace-normalized and trimmed.
 * @throws Error if DOMPurify was not loaded (no `window`, e.g. during SSR).
 */
function htmlToText(html: string): string {
  // DOMPurify is only loaded when `window` exists; fail with a clear message.
  if (!DOMPurify) {
    throw new Error('HTML sanitization is only available in the browser');
  }

  // Ask for the sanitized DOM node instead of a serialized HTML string.
  // The string form keeps text HTML-escaped ("&amp;", "&lt;", ...), which would
  // leak entities into the plain-text output; textContent yields decoded text.
  const body = DOMPurify.sanitize(html, { ALLOWED_TAGS: [], RETURN_DOM: true });
  const text: string = body.textContent ?? '';

  // Clean up whitespace
  return text.replace(/\s+/g, ' ').trim();
}
/**
 * Convert plain text to Markdown.
 *
 * No Markdown syntax is added; the text is only normalized into paragraphs
 * separated by exactly one blank line. Line breaks inside a paragraph are kept.
 *
 * @param text - Plain text with any line-ending style (LF, CRLF or CR).
 * @returns The paragraphs joined by a single blank line, using LF line endings.
 */
function textToMarkdown(text: string): string {
  return text
    // Normalize CRLF / CR so Windows-style files split into paragraphs too
    .replace(/\r\n?/g, '\n')
    // A paragraph break is one or more blank (or whitespace-only) lines
    .split(/\n(?:[ \t]*\n)+/)
    .filter((paragraph) => paragraph.trim() !== '')
    .join('\n\n');
}
/**
 * Convert plain text to a complete HTML document.
 *
 * The text is HTML-escaped, split into paragraphs on blank lines, and each
 * paragraph is wrapped in `<p>` with its internal line breaks turned into
 * `<br>`. The result is embedded in a standalone page with minimal styling.
 *
 * @param text - Plain text with any line-ending style (LF, CRLF or CR).
 * @returns The full HTML document as a string.
 */
function textToHtml(text: string): string {
  // Escape HTML entities ("&" first so later replacements are not re-escaped)
  const escaped = text
    // Normalize CRLF / CR first; otherwise Windows-style text never matches the
    // paragraph separator and stray "\r" characters end up in the markup.
    .replace(/\r\n?/g, '\n')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#039;');

  // Convert blank-line-separated chunks to paragraphs; a separator is one or
  // more blank (or whitespace-only) lines, so extra blank lines do not create
  // paragraphs that start with a spurious <br>.
  const paragraphs = escaped
    .split(/\n(?:[ \t]*\n)+/)
    .filter((paragraph) => paragraph.trim() !== '')
    .map((paragraph) => ` <p>${paragraph.replace(/\n/g, '<br>')}</p>`)
    .join('\n');

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${paragraphs}
</body>
</html>`;
}
/**
 * MIME types keyed by lower-case output format.
 * A Map is used so that only the listed keys can match; a plain object would
 * also answer for inherited names such as "constructor".
 */
const DOCUMENT_MIME_TYPES: ReadonlyMap<string, string> = new Map([
  ['html', 'text/html'],
  ['htm', 'text/html'],
  ['md', 'text/markdown'],
  ['markdown', 'text/markdown'],
  ['txt', 'text/plain'],
]);

/**
 * Get MIME type for output format.
 *
 * @param format - Output format name (case-insensitive), e.g. "html" or "md".
 * @returns The matching MIME type, or "text/plain" for unknown formats.
 */
function getMimeType(format: string): string {
  return DOCUMENT_MIME_TYPES.get(format.toLowerCase()) ?? 'text/plain';
}
/**
* Convert Markdown to HTML (convenience function)
*/
export async function markdownToHtmlFile(
file: File,
onProgress?: ProgressCallback
): Promise<ConversionResult> {
@@ -31,9 +302,9 @@ export async function markdownToHtml(
}
/**
* Convert HTML to Markdown (placeholder)
* Convert HTML to Markdown (convenience function)
*/
export async function htmlToMarkdown(
export async function htmlToMarkdownFile(
file: File,
onProgress?: ProgressCallback
): Promise<ConversionResult> {

View File

@@ -76,16 +76,30 @@ export async function loadImageMagick(): Promise<any> {
}
/**
* Load Pandoc WASM module (placeholder for future implementation)
* Load Pandoc converter (uses pure JavaScript libraries, not WASM)
* Note: We use marked + turndown instead of actual Pandoc WASM
*/
// Loads the document-conversion libraries and caches the result.
// Returns the cached object when `pandocInstance` is set and `moduleState.pandoc`
// is truthy. Otherwise it dynamically imports `marked` and `turndown` in
// parallel, stores both module objects as `{ marked, turndown }` in
// `pandocInstance`, sets `moduleState.pandoc`, and returns that object.
// An import failure is logged and rethrown as a generic Error.
export async function loadPandoc(): Promise<any> {
if (pandocInstance && moduleState.pandoc) {
return pandocInstance;
}
// NOTE(review): the TODO and `throw` below read like the pre-change placeholder
// shown alongside the new implementation; if they are really present, the `try`
// block after them can never run — confirm against the actual file.
// TODO: Implement Pandoc WASM loading when available
// For now, throw an error
throw new Error('Pandoc WASM module is not yet implemented');
try {
// Import the converter libraries
const [marked, turndown] = await Promise.all([
import('marked'),
import('turndown'),
]);
// Cache the loaded modules so later calls take the early-return path above
pandocInstance = { marked, turndown };
moduleState.pandoc = true;
console.log('Document converter loaded successfully');
return pandocInstance;
} catch (error) {
// The original error is only logged; callers receive a generic message
console.error('Failed to load document converter:', error);
throw new Error('Failed to load document converter');
}
}
/**

View File

@@ -13,18 +13,22 @@
"@ffmpeg/util": "^0.12.1",
"@imagemagick/magick-wasm": "^0.0.30",
"clsx": "^2.1.1",
"dompurify": "^3.2.2",
"fuse.js": "^7.1.0",
"lucide-react": "^0.553.0",
"marked": "^15.0.4",
"next": "^16.0.0",
"react": "^19.0.0",
"react-dom": "^19.0.0",
"tailwind-merge": "^3.3.1"
"tailwind-merge": "^3.3.1",
"turndown": "^7.2.0"
},
"devDependencies": {
"@tailwindcss/postcss": "^4.1.17",
"@types/node": "^22",
"@types/react": "^19",
"@types/react-dom": "^19",
"@types/turndown": "^5.0.5",
"eslint": "^9",
"eslint-config-next": "^16.0.0",
"tailwindcss": "^4.0.0",

49
pnpm-lock.yaml generated
View File

@@ -20,12 +20,18 @@ importers:
clsx:
specifier: ^2.1.1
version: 2.1.1
dompurify:
specifier: ^3.2.2
version: 3.3.0
fuse.js:
specifier: ^7.1.0
version: 7.1.0
lucide-react:
specifier: ^0.553.0
version: 0.553.0(react@19.2.0)
marked:
specifier: ^15.0.4
version: 15.0.12
next:
specifier: ^16.0.0
version: 16.0.3(@babel/core@7.28.5)(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
@@ -38,6 +44,9 @@ importers:
tailwind-merge:
specifier: ^3.3.1
version: 3.4.0
turndown:
specifier: ^7.2.0
version: 7.2.2
devDependencies:
'@tailwindcss/postcss':
specifier: ^4.1.17
@@ -51,6 +60,9 @@ importers:
'@types/react-dom':
specifier: ^19
version: 19.2.3(@types/react@19.2.5)
'@types/turndown':
specifier: ^5.0.5
version: 5.0.6
eslint:
specifier: ^9
version: 9.39.1(jiti@2.6.1)
@@ -368,6 +380,9 @@ packages:
'@jridgewell/trace-mapping@0.3.31':
resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==}
'@mixmark-io/domino@2.2.0':
resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==}
'@napi-rs/wasm-runtime@0.2.12':
resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==}
@@ -558,6 +573,12 @@ packages:
'@types/react@19.2.5':
resolution: {integrity: sha512-keKxkZMqnDicuvFoJbzrhbtdLSPhj/rZThDlKWCDbgXmUg0rEUFtRssDXKYmtXluZlIqiC5VqkCgRwzuyLHKHw==}
'@types/trusted-types@2.0.7':
resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==}
'@types/turndown@5.0.6':
resolution: {integrity: sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==}
'@typescript-eslint/eslint-plugin@8.46.4':
resolution: {integrity: sha512-R48VhmTJqplNyDxCyqqVkFSZIx1qX6PzwqgcXn1olLrzxcSBDlOsbtcnQuQhNtnNiJ4Xe5gREI1foajYaYU2Vg==}
engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0}
@@ -910,6 +931,9 @@ packages:
resolution: {integrity: sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==}
engines: {node: '>=0.10.0'}
dompurify@3.3.0:
resolution: {integrity: sha512-r+f6MYR1gGN1eJv0TVQbhA7if/U7P87cdPl3HN5rikqaBSBxLiCb/b9O+2eG0cxz0ghyU+mU1QkbsOwERMYlWQ==}
dunder-proto@1.0.1:
resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==}
engines: {node: '>= 0.4'}
@@ -1509,6 +1533,11 @@ packages:
magic-string@0.30.21:
resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==}
marked@15.0.12:
resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
engines: {node: '>= 18'}
hasBin: true
math-intrinsics@1.1.0:
resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
engines: {node: '>= 0.4'}
@@ -1871,6 +1900,9 @@ packages:
tslib@2.8.1:
resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==}
turndown@7.2.2:
resolution: {integrity: sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==}
type-check@0.4.0:
resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
engines: {node: '>= 0.8.0'}
@@ -2266,6 +2298,8 @@ snapshots:
'@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.5.5
'@mixmark-io/domino@2.2.0': {}
'@napi-rs/wasm-runtime@0.2.12':
dependencies:
'@emnapi/core': 1.7.1
@@ -2415,6 +2449,11 @@ snapshots:
dependencies:
csstype: 3.2.2
'@types/trusted-types@2.0.7':
optional: true
'@types/turndown@5.0.6': {}
'@typescript-eslint/eslint-plugin@8.46.4(@typescript-eslint/parser@8.46.4(eslint@9.39.1(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.1(jiti@2.6.1))(typescript@5.9.3)':
dependencies:
'@eslint-community/regexpp': 4.12.2
@@ -2788,6 +2827,10 @@ snapshots:
dependencies:
esutils: 2.0.3
dompurify@3.3.0:
optionalDependencies:
'@types/trusted-types': 2.0.7
dunder-proto@1.0.1:
dependencies:
call-bind-apply-helpers: 1.0.2
@@ -3520,6 +3563,8 @@ snapshots:
dependencies:
'@jridgewell/sourcemap-codec': 1.5.5
marked@15.0.12: {}
math-intrinsics@1.1.0: {}
merge2@1.4.1: {}
@@ -3947,6 +3992,10 @@ snapshots:
tslib@2.8.1: {}
turndown@7.2.2:
dependencies:
'@mixmark-io/domino': 2.2.0
type-check@0.4.0:
dependencies:
prelude-ls: 1.2.1