refactor: remove all document conversion support, keep only media conversions
This commit completely removes document conversion functionality to focus exclusively on media file conversions (video, audio, images). Changes: - Remove all document converter services (pandocService.ts, pdfService.ts, docxService.ts) - Uninstall document-related packages: marked, turndown, dompurify, jspdf, pdfjs-dist, docx, mammoth, @types/turndown - Remove document formats (PDF, DOCX, Markdown, HTML, TXT) from formatMappings.ts - Remove pandoc converter from FileConverter.tsx - Remove pandoc loader and references from wasmLoader.ts - Update TypeScript types to remove 'pandoc' from ConverterEngine and 'document' from FileCategory - Remove pandoc from WASMModuleState interface - Update README.md to remove all document conversion documentation - Update UI descriptions to reflect media-only conversions Supported conversions now: - Video: MP4, WebM, AVI, MOV, MKV, GIF - Audio: MP3, WAV, OGG, AAC, FLAC - Images: PNG, JPG, WebP, GIF, BMP, TIFF, SVG 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,443 +0,0 @@
|
||||
import { Document, Packer, Paragraph, TextRun, HeadingLevel } from 'docx';
|
||||
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
||||
|
||||
/**
 * Pulls the raw, unformatted text out of a DOCX file via mammoth.
 *
 * @param file - DOCX file whose bytes are handed to mammoth.
 * @param onProgress - Optional callback; receives 10, 30, 50 and 100 as each step completes.
 * @returns The `value` string of mammoth's `extractRawText` result.
 */
export async function extractTextFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // mammoth is imported lazily, so it is only loaded when this path actually runs.
  const mammothLib = await import('mammoth');
  onProgress?.(30);

  const docxBytes = await file.arrayBuffer();
  onProgress?.(50);

  const { value: rawText } = await mammothLib.extractRawText({ arrayBuffer: docxBytes });
  onProgress?.(100);

  return rawText;
}
|
||||
|
||||
/**
 * Renders the contents of a DOCX file as an HTML fragment via mammoth.
 *
 * @param file - DOCX file whose bytes are handed to mammoth.
 * @param onProgress - Optional callback; receives 10, 30, 50 and 100 as each step completes.
 * @returns The `value` string of mammoth's `convertToHtml` result.
 */
export async function extractHTMLFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // mammoth is imported lazily, so it is only loaded when this path actually runs.
  const mammothLib = await import('mammoth');
  onProgress?.(30);

  const docxBytes = await file.arrayBuffer();
  onProgress?.(50);

  const { value: markup } = await mammothLib.convertToHtml({ arrayBuffer: docxBytes });
  onProgress?.(100);

  return markup;
}
|
||||
|
||||
/**
 * Converts a DOCX file into a `text/plain` blob.
 *
 * Errors thrown during extraction are caught, logged, and reported through
 * the returned result instead of being rethrown.
 *
 * @param file - DOCX file to read.
 * @param onProgress - Optional progress callback, forwarded unchanged to the extraction step.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function docxToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    const extracted = await extractTextFromDOCX(file, onProgress);

    return {
      success: true,
      blob: new Blob([extracted], { type: 'text/plain' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[DOCX Converter] DOCX to text error:', err);

    const message = err instanceof Error ? err.message : 'Failed to extract text from DOCX';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Converts a DOCX file into a standalone HTML document (`text/html` blob).
 *
 * The HTML fragment extracted from the DOCX is placed inside a fixed page
 * shell with a small embedded stylesheet. Errors are caught, logged, and
 * reported through the returned result.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional callback; extraction progress is scaled by 0.9, then 100 is reported.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function docxToHTML(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Extraction accounts for the first 90% of the reported progress.
    const bodyMarkup = await extractHTMLFromDOCX(file, (pct) => {
      onProgress?.(pct * 0.9);
    });

    // The template is kept flush-left so no extra whitespace ends up in the output.
    const pageMarkup = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${bodyMarkup}
</body>
</html>`;

    onProgress?.(100);

    return {
      success: true,
      blob: new Blob([pageMarkup], { type: 'text/html' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[DOCX Converter] DOCX to HTML error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert DOCX to HTML';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Converts a DOCX file into a `text/markdown` blob.
 *
 * Works in two stages: the DOCX is first rendered to HTML, then that HTML is
 * turned into Markdown with turndown (ATX headings, fenced code blocks, `-`
 * bullets). Errors are caught, logged, and reported through the returned result.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional callback; extraction progress is scaled by 0.7, then 80 and 100 are reported.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function docxToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Stage 1: DOCX -> HTML, mapped onto the first 70% of reported progress.
    const intermediateHtml = await extractHTMLFromDOCX(file, (pct) => {
      onProgress?.(pct * 0.7);
    });
    onProgress?.(80);

    // Stage 2: HTML -> Markdown; turndown is imported lazily.
    const { default: Turndown } = await import('turndown');
    const converter = new Turndown({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    });
    const markdownText = converter.turndown(intermediateHtml);
    onProgress?.(100);

    return {
      success: true,
      blob: new Blob([markdownText], { type: 'text/markdown' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[DOCX Converter] DOCX to Markdown error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert DOCX to Markdown';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Builds a DOCX blob from plain text.
 *
 * The text is split on blank lines (`'\n\n'`); every chunk that is not pure
 * whitespace becomes one trimmed paragraph with 200 units of spacing after it.
 *
 * @param text - Plain text to lay out.
 * @param onProgress - Optional callback; receives 20, 40, 70 and 100 as each step completes.
 * @returns The packed document as produced by `Packer.toBlob`.
 */
async function createDOCXFromText(text: string, onProgress?: ProgressCallback): Promise<Blob> {
  onProgress?.(20);

  const chunks = text.split('\n\n').filter((chunk) => chunk.trim());
  onProgress?.(40);

  const body = chunks.map(
    (chunk) =>
      new Paragraph({
        children: [new TextRun(chunk.trim())],
        spacing: { after: 200 },
      })
  );
  const wordDoc = new Document({
    sections: [{ properties: {}, children: body }],
  });
  onProgress?.(70);

  const packed = await Packer.toBlob(wordDoc);
  onProgress?.(100);

  return packed;
}
|
||||
|
||||
/**
 * Builds a DOCX blob from Markdown, recognising only a small subset of it.
 *
 * Lines starting with `# `, `## ` or `### ` become level 1-3 headings. A blank
 * line ends the current paragraph. Any other line is collected verbatim, and
 * the collected lines of a paragraph are joined with single spaces. No inline
 * Markdown (emphasis, links, lists, ...) is interpreted.
 *
 * @param markdown - Markdown source text.
 * @param onProgress - Optional callback; receives 10, 60, 80 and 100 as each step completes.
 * @returns The packed document as produced by `Packer.toBlob`.
 */
async function createDOCXFromMarkdown(markdown: string, onProgress?: ProgressCallback): Promise<Blob> {
  onProgress?.(10);

  // Heading markers are mutually exclusive prefixes, so lookup order does not matter.
  const headingRules = [
    { marker: '# ', level: HeadingLevel.HEADING_1, spacing: { before: 240, after: 120 } },
    { marker: '## ', level: HeadingLevel.HEADING_2, spacing: { before: 200, after: 100 } },
    { marker: '### ', level: HeadingLevel.HEADING_3, spacing: { before: 160, after: 80 } },
  ];

  const blocks: Paragraph[] = [];
  let pendingLines: string[] = [];

  // Emits the lines gathered so far as a single body paragraph, if there are any.
  const flushPending = (): void => {
    if (pendingLines.length === 0) {
      return;
    }
    blocks.push(
      new Paragraph({
        children: [new TextRun(pendingLines.join(' '))],
        spacing: { after: 200 },
      })
    );
    pendingLines = [];
  };

  for (const row of markdown.split('\n')) {
    const rule = headingRules.find(({ marker }) => row.startsWith(marker));

    if (rule) {
      // A heading terminates whatever paragraph was being collected.
      flushPending();
      blocks.push(
        new Paragraph({
          text: row.substring(rule.marker.length),
          heading: rule.level,
          spacing: rule.spacing,
        })
      );
    } else if (row.trim() === '') {
      flushPending();
    } else {
      pendingLines.push(row);
    }
  }

  // Text after the last blank line or heading still needs to be emitted.
  flushPending();
  onProgress?.(60);

  const wordDoc = new Document({
    sections: [{ properties: {}, children: blocks }],
  });
  onProgress?.(80);

  const packed = await Packer.toBlob(wordDoc);
  onProgress?.(100);

  return packed;
}
|
||||
|
||||
/**
 * Converts a plain-text file into a DOCX blob.
 *
 * Errors are caught, logged, and reported through the returned result.
 *
 * @param file - Text file to read.
 * @param onProgress - Optional callback; receives 10 and 20 while reading, then
 *                     `20 + p * 0.8` for each value `p` reported by document creation.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function textToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    onProgress?.(10);
    const contents = await file.text();
    onProgress?.(20);

    const docxBlob = await createDOCXFromText(contents, (pct) => {
      onProgress?.(20 + pct * 0.8);
    });

    return { success: true, blob: docxBlob, duration: Date.now() - startedAt };
  } catch (err) {
    console.error('[DOCX Converter] Text to DOCX error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert text to DOCX';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Converts a Markdown file into a DOCX blob.
 *
 * Errors are caught, logged, and reported through the returned result.
 *
 * @param file - Markdown file to read.
 * @param onProgress - Optional callback; receives 10 and 20 while reading, then
 *                     `20 + p * 0.8` for each value `p` reported by document creation.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function markdownToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    onProgress?.(10);
    const source = await file.text();
    onProgress?.(20);

    const docxBlob = await createDOCXFromMarkdown(source, (pct) => {
      onProgress?.(20 + pct * 0.8);
    });

    return { success: true, blob: docxBlob, duration: Date.now() - startedAt };
  } catch (err) {
    console.error('[DOCX Converter] Markdown to DOCX error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert Markdown to DOCX';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Converts an HTML file into a DOCX blob containing only its text.
 *
 * `<script>` and `<style>` elements are dropped together with their contents,
 * remaining tags are replaced by spaces, a handful of common character
 * entities are decoded, and all whitespace runs are collapsed to one space.
 * Because of that collapse the resulting text contains no blank lines.
 * Errors are caught, logged, and reported through the returned result.
 *
 * @param file - HTML file to read.
 * @param onProgress - Optional callback; receives 10, 20 and 50 while preparing the text,
 *                     then `50 + p * 0.5` for each value `p` reported by document creation.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function htmlToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const html = await file.text();

    if (onProgress) onProgress(20);

    // Reduce the markup to plain text.
    const text = html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
      .replace(/<[^>]*>/g, ' ')
      // Decode entities; the patterns must name the entity, not the decoded character.
      .replace(/&nbsp;/g, ' ')
      .replace(/&quot;/g, '"')
      .replace(/&(?:#0*39|apos);/g, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      // `&amp;` goes last so that e.g. `&amp;lt;` yields the literal text `&lt;`.
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();

    if (onProgress) onProgress(50);

    const blob = await createDOCXFromText(text, (progress) => {
      if (onProgress) onProgress(50 + progress * 0.5);
    });

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[DOCX Converter] HTML to DOCX error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert HTML to DOCX',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
@@ -1,379 +0,0 @@
|
||||
import { marked } from 'marked';
|
||||
import TurndownService from 'turndown';
|
||||
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
||||
import {
|
||||
pdfToText,
|
||||
pdfToMarkdown,
|
||||
markdownToPDF,
|
||||
htmlToPDF,
|
||||
plainTextToPDF,
|
||||
} from './pdfService';
|
||||
import {
|
||||
docxToText,
|
||||
docxToHTML,
|
||||
docxToMarkdown,
|
||||
textToDOCX,
|
||||
markdownToDOCX,
|
||||
htmlToDOCX,
|
||||
} from './docxService';
|
||||
|
||||
// DOMPurify is only required when a `window` global exists; without one the
// binding stays undefined.
let DOMPurify: any = typeof window !== 'undefined' ? require('dompurify') : undefined;
|
||||
|
||||
/**
 * Converts a document between text-like formats using lightweight JS
 * libraries. Despite the name, no Pandoc binary or WASM build is involved.
 *
 * The input format is taken from the file extension. The output format is
 * matched case-insensitively. Supported routes:
 * - PDF -> txt, md/markdown (delegated to the PDF service)
 * - DOCX -> txt, html, md/markdown (delegated to the DOCX service)
 * - md/markdown, html/htm, txt -> pdf or docx (delegated)
 * - md/markdown -> html, txt
 * - html/htm -> md/markdown, txt
 * - txt -> md/markdown, html
 *
 * The file is only read as text for the last three routes; PDF and DOCX
 * routes hand the `File` straight to the delegate, which then reports its
 * own progress. Errors, including unsupported routes, are caught, logged,
 * and reported through the returned result.
 *
 * @param file - Source file; its extension selects the input format.
 * @param outputFormat - Target format such as `'html'`, `'md'`, `'txt'`, `'pdf'` or `'docx'`.
 * @param options - Conversion options; not consulted by this function.
 * @param onProgress - Optional progress callback.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function convertWithPandoc(
  file: File,
  outputFormat: string,
  options: ConversionOptions = {},
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const inputExt = file.name.split('.').pop()?.toLowerCase();
    // getMimeType already lowercases the format, so route matching does the same.
    const target = outputFormat.toLowerCase();

    const inputIsMarkdown = inputExt === 'md' || inputExt === 'markdown';
    const inputIsHtml = inputExt === 'html' || inputExt === 'htm';
    const inputIsText = inputExt === 'txt';
    const targetIsMarkdown = target === 'md' || target === 'markdown';

    // Routes below are delegated before the file is read as text: PDF and DOCX
    // inputs are binary, and each delegate reads the file itself.
    if (inputExt === 'pdf') {
      if (target === 'txt') return await pdfToText(file, onProgress);
      if (targetIsMarkdown) return await pdfToMarkdown(file, onProgress);
      throw new Error(`Conversion from PDF to ${outputFormat} not supported`);
    }

    if (inputExt === 'docx') {
      if (target === 'txt') return await docxToText(file, onProgress);
      if (target === 'html') return await docxToHTML(file, onProgress);
      if (targetIsMarkdown) return await docxToMarkdown(file, onProgress);
      throw new Error(`Conversion from DOCX to ${outputFormat} not supported`);
    }

    if (target === 'pdf') {
      if (inputIsMarkdown) return await markdownToPDF(file, onProgress);
      if (inputIsHtml) return await htmlToPDF(file, onProgress);
      if (inputIsText) return await plainTextToPDF(file, onProgress);
      throw new Error(`Conversion from ${inputExt} to PDF not supported`);
    }

    if (target === 'docx') {
      if (inputIsMarkdown) return await markdownToDOCX(file, onProgress);
      if (inputIsHtml) return await htmlToDOCX(file, onProgress);
      if (inputIsText) return await textToDOCX(file, onProgress);
      throw new Error(`Conversion from ${inputExt} to DOCX not supported`);
    }

    // Only the text-to-text routes need the contents as a string.
    const text = await file.text();

    if (onProgress) onProgress(30);
    if (onProgress) onProgress(50);

    let result: string;

    if (inputIsMarkdown) {
      if (target === 'html') {
        result = await markdownToHtml(text);
      } else if (target === 'txt') {
        result = markdownToText(text);
      } else {
        throw new Error(`Conversion from Markdown to ${outputFormat} not supported`);
      }
    } else if (inputIsHtml) {
      if (targetIsMarkdown) {
        result = await htmlToMarkdown(text);
      } else if (target === 'txt') {
        result = htmlToText(text);
      } else {
        throw new Error(`Conversion from HTML to ${outputFormat} not supported`);
      }
    } else if (inputIsText) {
      if (targetIsMarkdown) {
        result = textToMarkdown(text);
      } else if (target === 'html') {
        result = textToHtml(text);
      } else {
        throw new Error(`Conversion from TXT to ${outputFormat} not supported`);
      }
    } else {
      throw new Error(`Input format ${inputExt} not supported`);
    }

    if (onProgress) onProgress(90);

    const blob = new Blob([result], { type: getMimeType(target) });

    if (onProgress) onProgress(100);

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[Document Converter] Conversion error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown conversion error',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
|
||||
/**
 * Renders Markdown as a complete, styled HTML document.
 *
 * The Markdown is parsed with marked (GFM on, single newlines rendered as
 * `<br>`), the resulting HTML is passed through `DOMPurify.sanitize`, and the
 * sanitized fragment is placed in a fixed page shell with an embedded stylesheet.
 *
 * Note that `marked.setOptions` changes marked's shared options, not just
 * those of this call.
 *
 * @param markdown - Markdown source text.
 * @returns The full HTML document as a string.
 */
async function markdownToHtml(markdown: string): Promise<string> {
  marked.setOptions({
    gfm: true, // GitHub Flavored Markdown
    breaks: true, // render "\n" as <br>
  });

  const rendered = await marked.parse(markdown);

  // Sanitize before embedding the fragment in the page.
  const safeMarkup = DOMPurify.sanitize(rendered);

  // The template is kept flush-left so no extra whitespace ends up in the output.
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
pre {
background: #f4f4f4;
border: 1px solid #ddd;
border-radius: 4px;
padding: 1rem;
overflow-x: auto;
}
code {
background: #f4f4f4;
padding: 0.2rem 0.4rem;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
blockquote {
border-left: 4px solid #ddd;
margin: 1rem 0;
padding-left: 1rem;
color: #666;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1rem 0;
}
th, td {
border: 1px solid #ddd;
padding: 0.5rem;
text-align: left;
}
th {
background: #f4f4f4;
}
</style>
</head>
<body>
${safeMarkup}
</body>
</html>`;
}
|
||||
|
||||
/**
 * Converts an HTML string to Markdown.
 *
 * The HTML is passed through `DOMPurify.sanitize` first and then handed to
 * turndown, configured for ATX (`#`) headings, fenced code blocks and `-`
 * bullet markers.
 *
 * @param html - HTML source text.
 * @returns The Markdown produced by turndown.
 */
async function htmlToMarkdown(html: string): Promise<string> {
  const safeMarkup = DOMPurify.sanitize(html);

  const converter = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
  });

  return converter.turndown(safeMarkup);
}
|
||||
|
||||
/**
 * Strips common Markdown syntax from a string, leaving readable plain text.
 *
 * This is a regex-based best effort, not a Markdown parser. It removes
 * heading markers, bold/italic markers, blockquote markers and horizontal
 * rules; replaces images with their alt text and links with their label;
 * drops fenced code blocks entirely; unwraps inline code; and collapses runs
 * of three or more newlines to two.
 *
 * @param markdown - Markdown source text.
 * @returns The stripped text with leading and trailing whitespace removed.
 */
function markdownToText(markdown: string): string {
  const text = markdown
    // Heading markers
    .replace(/^#{1,6}\s+/gm, '')
    // Bold, then italic
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Images must be handled before links: the link pattern also matches the
    // `[alt](url)` part of `![alt](url)` and would leave a stray `!` behind.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Links -> label only
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Fenced code blocks are removed together with their contents
    .replace(/```[\s\S]*?```/g, '')
    // Inline code -> contents only
    .replace(/`([^`]+)`/g, '$1')
    // Blockquote markers
    .replace(/^>\s+/gm, '')
    // Horizontal rules
    .replace(/^-{3,}$/gm, '')
    // Collapse excess blank lines
    .replace(/\n{3,}/g, '\n\n');

  return text.trim();
}
|
||||
|
||||
/**
 * Reduces an HTML string to plain text.
 *
 * `DOMPurify.sanitize` is called with an empty `ALLOWED_TAGS` list, and every
 * whitespace run in its output is then collapsed to a single space.
 *
 * @param html - HTML source text.
 * @returns The text with leading and trailing whitespace removed.
 */
function htmlToText(html: string): string {
  const tagless = DOMPurify.sanitize(html, { ALLOWED_TAGS: [] });

  return tagless.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/**
 * Turns plain text into Markdown by normalising its paragraph breaks.
 *
 * The text is split on `'\n\n'`, chunks that are empty or whitespace-only are
 * dropped, and the remaining chunks are re-joined with `'\n\n'`. The chunks
 * themselves are left untouched.
 *
 * @param text - Plain text input.
 * @returns The text with empty paragraphs removed.
 */
function textToMarkdown(text: string): string {
  const kept: string[] = [];

  for (const chunk of text.split('\n\n')) {
    if (chunk.trim()) {
      kept.push(chunk);
    }
  }

  return kept.join('\n\n');
}
|
||||
|
||||
/**
 * Wraps plain text in a complete, styled HTML document.
 *
 * The five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) are escaped
 * first so the text cannot be interpreted as markup. The text is then split
 * on blank lines (`'\n\n'`); each chunk that is not whitespace-only becomes a
 * `<p>` element, with single newlines inside it rendered as `<br>`.
 *
 * @param text - Plain text input.
 * @returns The full HTML document as a string.
 */
function textToHtml(text: string): string {
  // `&` must be escaped first, otherwise the `&` of the entities produced by
  // the later replacements would be escaped a second time.
  const escaped = text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  const paragraphs = escaped
    .split('\n\n')
    .filter((p) => p.trim())
    .map((p) => ` <p>${p.replace(/\n/g, '<br>')}</p>`)
    .join('\n');

  // The template is kept flush-left so no extra whitespace ends up in the output.
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${paragraphs}
</body>
</html>`;
}
|
||||
|
||||
/**
 * Maps an output format name to the MIME type used for the result blob.
 *
 * The lookup is case-insensitive. Known formats are `html`, `htm`, `md`,
 * `markdown` and `txt`; anything else falls back to `text/plain`.
 *
 * @param format - Format name, e.g. `'html'` or `'MD'`.
 * @returns The matching MIME type, or `'text/plain'` for unknown formats.
 */
function getMimeType(format: string): string {
  // A Map is used instead of a plain object so that names such as
  // "constructor" or "toString" cannot resolve to inherited Object members.
  const mimeTypes = new Map<string, string>([
    ['html', 'text/html'],
    ['htm', 'text/html'],
    ['md', 'text/markdown'],
    ['markdown', 'text/markdown'],
    ['txt', 'text/plain'],
  ]);

  return mimeTypes.get(format.toLowerCase()) ?? 'text/plain';
}
|
||||
|
||||
/**
 * Convenience wrapper: runs `convertWithPandoc` with `'html'` as the output
 * format and empty options.
 *
 * @param file - Source file; `convertWithPandoc` picks the input format from its extension.
 * @param onProgress - Optional progress callback, forwarded unchanged.
 * @returns The result produced by `convertWithPandoc`.
 */
export async function markdownToHtmlFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const outcome = await convertWithPandoc(file, 'html', {}, onProgress);
  return outcome;
}
|
||||
|
||||
/**
 * Convenience wrapper: runs `convertWithPandoc` with `'md'` as the output
 * format and empty options.
 *
 * @param file - Source file; `convertWithPandoc` picks the input format from its extension.
 * @param onProgress - Optional progress callback, forwarded unchanged.
 * @returns The result produced by `convertWithPandoc`.
 */
export async function htmlToMarkdownFile(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const outcome = await convertWithPandoc(file, 'md', {}, onProgress);
  return outcome;
}
|
||||
@@ -1,334 +0,0 @@
|
||||
import { jsPDF } from 'jspdf';
|
||||
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
||||
|
||||
/**
 * Extracts the text of every page of a PDF file using pdfjs-dist.
 *
 * For each page the `str` values of its text items are joined with spaces;
 * pages are separated by a blank line. The pdf.js worker script is loaded
 * from cdnjs, using the version reported by the imported library.
 *
 * @param file - PDF file to read.
 * @param onProgress - Optional callback; receives 10, 20, 30 and 50 during setup, a rounded
 *                     value between 50 and 90 after each page, and finally 100.
 * @returns The combined text with leading and trailing whitespace removed.
 */
export async function extractTextFromPDF(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // pdfjs-dist is imported lazily, so it is only loaded when this path actually runs.
  const pdfjs = await import('pdfjs-dist');
  pdfjs.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.mjs`;
  onProgress?.(20);

  const pdfBytes = await file.arrayBuffer();
  onProgress?.(30);

  const pdfDoc = await pdfjs.getDocument({ data: pdfBytes }).promise;
  onProgress?.(50);

  const pageCount = pdfDoc.numPages;
  const pageTexts: string[] = [];

  // Page numbers are 1-based.
  for (let pageNo = 1; pageNo <= pageCount; pageNo += 1) {
    const page = await pdfDoc.getPage(pageNo);
    const { items } = await page.getTextContent();

    pageTexts.push(items.map((item: any) => item.str).join(' '));

    // Pages share the 50-90 band of the progress range.
    onProgress?.(Math.round(50 + (pageNo / pageCount) * 40));
  }

  onProgress?.(100);

  return pageTexts.join('\n\n').trim();
}
|
||||
|
||||
/**
 * Converts a PDF file into a `text/plain` blob.
 *
 * Errors thrown during extraction are caught, logged, and reported through
 * the returned result instead of being rethrown.
 *
 * @param file - PDF file to read.
 * @param onProgress - Optional progress callback, forwarded unchanged to the extraction step.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function pdfToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    const extracted = await extractTextFromPDF(file, onProgress);

    return {
      success: true,
      blob: new Blob([extracted], { type: 'text/plain' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[PDF Converter] PDF to text error:', err);

    const message = err instanceof Error ? err.message : 'Failed to extract text from PDF';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Converts a PDF file into a `text/markdown` blob.
 *
 * The "Markdown" is the extracted text with empty paragraphs removed: the
 * text is split on `'\n\n'`, whitespace-only chunks are dropped, and the rest
 * is re-joined with `'\n\n'`. No Markdown syntax is added. Errors are
 * caught, logged, and reported through the returned result.
 *
 * @param file - PDF file to read.
 * @param onProgress - Optional callback; extraction progress is scaled by 0.9, then 100 is reported.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function pdfToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Extraction accounts for the first 90% of the reported progress.
    const extracted = await extractTextFromPDF(file, (pct) => {
      onProgress?.(pct * 0.9);
    });

    const paragraphs: string[] = [];
    for (const chunk of extracted.split('\n\n')) {
      if (chunk.trim()) {
        paragraphs.push(chunk);
      }
    }
    const markdownText = paragraphs.join('\n\n');

    onProgress?.(100);

    return {
      success: true,
      blob: new Blob([markdownText], { type: 'text/markdown' }),
      duration: Date.now() - startedAt,
    };
  } catch (err) {
    console.error('[PDF Converter] PDF to markdown error:', err);

    const message = err instanceof Error ? err.message : 'Failed to convert PDF to Markdown';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
|
||||
|
||||
/**
 * Lays out plain text on portrait A4 pages and returns the PDF as a blob.
 *
 * The text is set in 12pt Helvetica with 20 mm margins and a 7 mm line
 * advance. It is wrapped to the usable page width, and a new page is started
 * whenever the next line would cross the bottom margin.
 *
 * @param text - Text to render.
 * @param filename - Not used by this function; kept in the signature for callers that pass it.
 * @param onProgress - Optional callback; receives 20, 40 and 60 during setup, a rounded value
 *                     between 60 and 90 on every tenth line, then 90 and 100.
 * @returns The blob returned by jsPDF's `output('blob')`.
 */
export async function textToPDF(
  text: string,
  filename: string = 'document.pdf',
  onProgress?: ProgressCallback
): Promise<Blob> {
  onProgress?.(20);

  const pdf = new jsPDF({ orientation: 'portrait', unit: 'mm', format: 'a4' });
  onProgress?.(40);

  pdf.setFont('helvetica');
  pdf.setFontSize(12);

  // Layout constants, in millimetres (the document unit chosen above).
  const marginMm = 20;
  const lineAdvanceMm = 7;

  const { pageSize } = pdf.internal;
  const usableWidth = pageSize.getWidth() - 2 * marginMm;
  const bottomLimit = pageSize.getHeight() - marginMm;
  let cursorY = marginMm;

  onProgress?.(60);

  const wrapped = pdf.splitTextToSize(text, usableWidth);
  const lineCount = wrapped.length;

  for (let index = 0; index < lineCount; index += 1) {
    // Start a new page when this line would not fit above the bottom margin.
    if (cursorY + lineAdvanceMm > bottomLimit) {
      pdf.addPage();
      cursorY = marginMm;
    }

    pdf.text(wrapped[index], marginMm, cursorY);
    cursorY += lineAdvanceMm;

    // Progress is reported only on every tenth line.
    if (index % 10 === 0) {
      onProgress?.(Math.round(60 + (index / lineCount) * 30));
    }
  }

  onProgress?.(90);

  const pdfBlob = pdf.output('blob');
  onProgress?.(100);

  return pdfBlob;
}
|
||||
|
||||
/**
 * Converts a Markdown file into a text-only PDF.
 *
 * The Markdown is rendered to HTML with marked, all tags are stripped, common
 * character entities are decoded, and the remaining text is laid out by
 * `textToPDF`. Formatting such as headings or emphasis is therefore not
 * preserved visually. Errors are caught, logged, and reported through the
 * returned result.
 *
 * @param file - Markdown file to read.
 * @param onProgress - Optional callback; receives 10, 20, 40 and 60 while preparing the text,
 *                     then `60 + p * 0.4` for each value `p` reported by PDF generation.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function markdownToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const markdown = await file.text();

    if (onProgress) onProgress(20);

    // marked is imported lazily, so it is only loaded when this path actually runs.
    const { marked } = await import('marked');

    const html = await marked.parse(markdown);

    if (onProgress) onProgress(40);

    // Reduce the rendered HTML to plain text.
    const text = html
      .replace(/<[^>]*>/g, '')
      // Decode entities; the patterns must name the entity, not the decoded character.
      .replace(/&nbsp;/g, ' ')
      .replace(/&quot;/g, '"')
      .replace(/&(?:#0*39|apos);/g, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      // `&amp;` goes last so that e.g. `&amp;lt;` yields the literal text `&lt;`.
      .replace(/&amp;/g, '&');

    if (onProgress) onProgress(60);

    const pdfBlob = await textToPDF(text, file.name.replace(/\.md$/, '.pdf'), (progress) => {
      if (onProgress) onProgress(60 + progress * 0.4);
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] Markdown to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert Markdown to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
|
||||
/**
 * Converts an HTML file into a text-only PDF.
 *
 * `<script>` and `<style>` elements are dropped together with their contents,
 * remaining tags are replaced by spaces, common character entities are
 * decoded, and all whitespace runs are collapsed to one space before the text
 * is laid out by `textToPDF`. Errors are caught, logged, and reported through
 * the returned result.
 *
 * @param file - HTML file to read.
 * @param onProgress - Optional callback; receives 10, 30 and 50 while preparing the text,
 *                     then `50 + p * 0.5` for each value `p` reported by PDF generation.
 * @returns `{ success: true, blob, duration }` on success, or
 *          `{ success: false, error, duration }` on failure; `duration` is elapsed milliseconds.
 */
export async function htmlToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const html = await file.text();

    if (onProgress) onProgress(30);

    // Reduce the markup to plain text.
    const text = html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
      .replace(/<[^>]*>/g, ' ')
      // Decode entities; the patterns must name the entity, not the decoded character.
      .replace(/&nbsp;/g, ' ')
      .replace(/&quot;/g, '"')
      .replace(/&(?:#0*39|apos);/g, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      // `&amp;` goes last so that e.g. `&amp;lt;` yields the literal text `&lt;`.
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();

    if (onProgress) onProgress(50);

    const pdfBlob = await textToPDF(text, file.name.replace(/\.html?$/, '.pdf'), (progress) => {
      if (onProgress) onProgress(50 + progress * 0.5);
    });

    return {
      success: true,
      blob: pdfBlob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[PDF Converter] HTML to PDF error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert HTML to PDF',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
|
||||
/**
 * Turn a plain-text file into a PDF.
 *
 * The file is read as text and passed unchanged to `textToPDF`, with a
 * trailing `.txt` in the file name swapped for `.pdf`.
 *
 * @param file - Text file to convert.
 * @param onProgress - Optional callback. Receives 10 and 30 while the file is
 *   read, then `textToPDF`'s progress mapped as `30 + progress * 0.7`.
 * @returns A result with the PDF blob on success or an error message on
 *   failure; errors are logged and returned rather than thrown.
 */
export async function plainTextToPDF(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const begunAt = Date.now();

  // Forward a progress value only when the caller supplied a callback.
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  try {
    report(10);

    const contents = await file.text();

    report(30);

    const outputName = file.name.replace(/\.txt$/, '.pdf');
    const blob = await textToPDF(contents, outputName, (progress) => {
      report(30 + progress * 0.7);
    });

    return { success: true, blob, duration: Date.now() - begunAt };
  } catch (error) {
    console.error('[PDF Converter] Text to PDF error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert text to PDF';
    return { success: false, error: message, duration: Date.now() - begunAt };
  }
}
|
||||
@@ -162,53 +162,6 @@ export const SUPPORTED_FORMATS: ConversionFormat[] = [
|
||||
converter: 'imagemagick',
|
||||
description: 'Scalable Vector Graphics',
|
||||
},
|
||||
|
||||
// Document formats (Pandoc - future implementation)
|
||||
{
|
||||
id: 'pdf',
|
||||
name: 'PDF',
|
||||
extension: 'pdf',
|
||||
mimeType: 'application/pdf',
|
||||
category: 'document',
|
||||
converter: 'pandoc',
|
||||
description: 'Portable Document Format',
|
||||
},
|
||||
{
|
||||
id: 'docx',
|
||||
name: 'DOCX',
|
||||
extension: 'docx',
|
||||
mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
category: 'document',
|
||||
converter: 'pandoc',
|
||||
description: 'Microsoft Word document',
|
||||
},
|
||||
{
|
||||
id: 'markdown',
|
||||
name: 'Markdown',
|
||||
extension: 'md',
|
||||
mimeType: 'text/markdown',
|
||||
category: 'document',
|
||||
converter: 'pandoc',
|
||||
description: 'Markdown text',
|
||||
},
|
||||
{
|
||||
id: 'html',
|
||||
name: 'HTML',
|
||||
extension: 'html',
|
||||
mimeType: 'text/html',
|
||||
category: 'document',
|
||||
converter: 'pandoc',
|
||||
description: 'HyperText Markup Language',
|
||||
},
|
||||
{
|
||||
id: 'txt',
|
||||
name: 'Plain Text',
|
||||
extension: 'txt',
|
||||
mimeType: 'text/plain',
|
||||
category: 'document',
|
||||
converter: 'pandoc',
|
||||
description: 'Plain text file',
|
||||
},
|
||||
];
|
||||
|
||||
/**
|
||||
|
||||
@@ -7,7 +7,6 @@ import type { ConverterEngine, WASMModuleState } from '@/types/conversion';
|
||||
// Per-engine "is loaded" flags; every engine starts out unloaded (false).
// loadPandoc() flips the pandoc flag to true once its libraries are imported,
// and unloadAll() resets flags back to false.
const moduleState: WASMModuleState = {
|
||||
ffmpeg: false,
|
||||
imagemagick: false,
|
||||
pandoc: false,
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -15,7 +14,6 @@ const moduleState: WASMModuleState = {
|
||||
*/
|
||||
let ffmpegInstance: FFmpeg | null = null;
|
||||
let imagemagickInstance: any = null;
|
||||
let pandocInstance: any = null;
|
||||
|
||||
/**
|
||||
* Load FFmpeg WASM module
|
||||
@@ -75,33 +73,6 @@ export async function loadImageMagick(): Promise<any> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Lazily load the document converter and cache it for later calls.
 *
 * Despite the name this is not Pandoc WASM: the "instance" is an object
 * holding the dynamically imported `marked` and `turndown` modules.
 *
 * @returns The cached `{ marked, turndown }` object.
 * @throws Error with a generic message when either import fails; the
 *   underlying error is logged to the console before rethrowing.
 */
export async function loadPandoc(): Promise<any> {
  const needsLoad = !pandocInstance || !moduleState.pandoc;
  if (!needsLoad) {
    return pandocInstance;
  }

  try {
    // Kick off both imports before awaiting so they resolve in parallel
    const pendingMarked = import('marked');
    const pendingTurndown = import('turndown');
    const libraries = await Promise.all([pendingMarked, pendingTurndown]);

    pandocInstance = { marked: libraries[0], turndown: libraries[1] };
    moduleState.pandoc = true;
    console.log('Document converter loaded successfully');

    return pandocInstance;
  } catch (error) {
    console.error('Failed to load document converter:', error);
    throw new Error('Failed to load document converter');
  }
}
|
||||
|
||||
/**
|
||||
* Get loaded module state
|
||||
*/
|
||||
@@ -125,8 +96,6 @@ export async function loadModule(engine: ConverterEngine): Promise<any> {
|
||||
return loadFFmpeg();
|
||||
case 'imagemagick':
|
||||
return loadImageMagick();
|
||||
case 'pandoc':
|
||||
return loadPandoc();
|
||||
default:
|
||||
throw new Error(`Unknown converter engine: ${engine}`);
|
||||
}
|
||||
@@ -148,10 +117,5 @@ export function unloadAll(): void {
|
||||
moduleState.imagemagick = false;
|
||||
}
|
||||
|
||||
if (pandocInstance) {
|
||||
pandocInstance = null;
|
||||
moduleState.pandoc = false;
|
||||
}
|
||||
|
||||
console.log('All WASM modules unloaded');
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user