Files
convert-ui/lib/converters/docxService.ts
Sebastian Krüger de3997f4df feat: add comprehensive DOCX document support
- Install docx (v9.5.1) and mammoth (v1.11.0) packages
- Create docxService.ts with full DOCX read/write functionality:
  - Extract text, HTML, and Markdown from DOCX files using mammoth
  - Generate DOCX files from Markdown with proper heading levels (H1-H3)
  - Generate DOCX files from HTML and plain text
  - Automatic paragraph formatting and spacing
- Integrate DOCX conversions into pandocService.ts
- Update README with DOCX support documentation
- Add DOCX libraries to tech stack section

Supported DOCX conversions:
- DOCX → Text/HTML/Markdown
- Markdown/HTML/Text → DOCX

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 11:25:23 +01:00

444 lines
10 KiB
TypeScript

import { Document, Packer, Paragraph, TextRun, HeadingLevel } from 'docx';
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
/**
 * Reads a DOCX file and returns its raw text content.
 *
 * mammoth is loaded on demand through a dynamic import (noted in the original
 * as client-side only), so it is only fetched when a conversion actually runs.
 *
 * @param file - DOCX file to read.
 * @param onProgress - Optional progress callback; receives 10, 30, 50 and 100.
 * @returns The `value` of mammoth's `extractRawText` result.
 */
export async function extractTextFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // Load the parser lazily.
  const mammothLib = await import('mammoth');
  onProgress?.(30);

  // mammoth works on the binary contents of the file.
  const arrayBuffer = await file.arrayBuffer();
  onProgress?.(50);

  const { value: rawText } = await mammothLib.extractRawText({ arrayBuffer });
  onProgress?.(100);

  return rawText;
}
/**
 * Reads a DOCX file and returns its content as an HTML fragment.
 *
 * mammoth is loaded on demand through a dynamic import (noted in the original
 * as client-side only), so it is only fetched when a conversion actually runs.
 *
 * @param file - DOCX file to read.
 * @param onProgress - Optional progress callback; receives 10, 30, 50 and 100.
 * @returns The `value` of mammoth's `convertToHtml` result.
 */
export async function extractHTMLFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  onProgress?.(10);

  // Load the parser lazily.
  const mammothLib = await import('mammoth');
  onProgress?.(30);

  // mammoth works on the binary contents of the file.
  const arrayBuffer = await file.arrayBuffer();
  onProgress?.(50);

  const { value: htmlFragment } = await mammothLib.convertToHtml({ arrayBuffer });
  onProgress?.(100);

  return htmlFragment;
}
/**
 * Converts a DOCX file into a plain-text blob.
 *
 * Never rejects: failures are logged and reported through the returned result.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional progress callback, passed straight to the extractor.
 * @returns On success `{ success: true, blob, duration }` with a `text/plain`
 *   blob; on failure `{ success: false, error, duration }`. `duration` is in
 *   milliseconds.
 */
export async function docxToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();
  const elapsed = (): number => Date.now() - startedAt;

  try {
    const extracted = await extractTextFromDOCX(file, onProgress);

    return {
      success: true,
      blob: new Blob([extracted], { type: 'text/plain' }),
      duration: elapsed(),
    };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to text error:', error);

    // Non-Error throwables carry no usable message, so fall back to a fixed one.
    const message = error instanceof Error ? error.message : 'Failed to extract text from DOCX';
    return { success: false, error: message, duration: elapsed() };
  }
}
/**
 * Converts a DOCX file into a standalone HTML document.
 *
 * The fragment returned by the extractor is placed inside a fixed HTML5 page
 * with a small embedded stylesheet. Never rejects: failures are logged and
 * reported through the returned result.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional progress callback. Extraction progress is scaled
 *   by 0.9; 100 is reported once the page has been assembled.
 * @returns On success `{ success: true, blob, duration }` with a `text/html`
 *   blob; on failure `{ success: false, error, duration }`. `duration` is in
 *   milliseconds.
 */
export async function docxToHTML(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Leave the last 10% of the progress range for wrapping the fragment.
    const reportExtraction = (progress: number): void => {
      if (onProgress) onProgress(progress * 0.9);
    };
    const html = await extractHTMLFromDOCX(file, reportExtraction);

    // The template lines are runtime text and are kept exactly as written.
    const fullHTML = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${html}
</body>
</html>`;
    onProgress?.(100);

    return {
      success: true,
      blob: new Blob([fullHTML], { type: 'text/html' }),
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to HTML error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert DOCX to HTML';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
/**
 * Converts a DOCX file into a Markdown blob.
 *
 * Works in two stages: the document is first turned into HTML by the
 * extractor, then that HTML is converted to Markdown with turndown (loaded on
 * demand). Never rejects: failures are logged and reported through the
 * returned result.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional progress callback. Extraction progress is scaled
 *   by 0.7, then 80 is reported before the Markdown step and 100 after it.
 * @returns On success `{ success: true, blob, duration }` with a
 *   `text/markdown` blob; on failure `{ success: false, error, duration }`.
 *   `duration` is in milliseconds.
 */
export async function docxToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Stage 1: DOCX -> HTML.
    const reportExtraction = (progress: number): void => {
      if (onProgress) onProgress(progress * 0.7);
    };
    const html = await extractHTMLFromDOCX(file, reportExtraction);
    onProgress?.(80);

    // Stage 2: HTML -> Markdown.
    const { default: TurndownService } = await import('turndown');
    const converter = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    });
    const markdown = converter.turndown(html);
    onProgress?.(100);

    return {
      success: true,
      blob: new Blob([markdown], { type: 'text/markdown' }),
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to Markdown error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert DOCX to Markdown';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
/**
 * Builds a DOCX blob from plain text, one Word paragraph per text paragraph.
 *
 * Paragraphs are separated by one or more blank lines. Line endings may be LF,
 * CRLF or lone CR, and a line containing only spaces or tabs counts as blank,
 * which is the same rule createDOCXFromMarkdown applies to paragraph breaks.
 * Each paragraph is trimmed; empty paragraphs are dropped. Single line breaks
 * inside a paragraph are passed to the TextRun as they are.
 *
 * @param text - Plain text to convert.
 * @param onProgress - Optional progress callback; receives 20, 40, 70 and 100.
 * @returns The packed document as a Blob.
 */
async function createDOCXFromText(text: string, onProgress?: ProgressCallback): Promise<Blob> {
  onProgress?.(20);

  const paragraphs = text
    // Normalise line endings first: splitting on a literal "\n\n" would never
    // match the "\r\n\r\n" separators of Windows-authored text.
    .replace(/\r\n?/g, '\n')
    // A paragraph break is a newline, optional whitespace, then another newline.
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);
  onProgress?.(40);

  const doc = new Document({
    sections: [
      {
        properties: {},
        children: paragraphs.map(
          (paragraph) =>
            new Paragraph({
              children: [new TextRun(paragraph)],
              spacing: { after: 200 },
            })
        ),
      },
    ],
  });
  onProgress?.(70);

  const blob = await Packer.toBlob(doc);
  onProgress?.(100);

  return blob;
}
/**
 * Builds a DOCX blob from a small subset of Markdown.
 *
 * Recognised structure:
 * - lines starting with "# ", "## " or "### " become Heading 1-3 paragraphs;
 * - blank (whitespace-only) lines end the current paragraph;
 * - consecutive non-blank lines are joined with a single space into one
 *   paragraph.
 *
 * Everything else (deeper headings, lists, inline emphasis, ...) is not
 * interpreted and ends up as literal text. Both LF and CRLF line endings are
 * accepted, so a trailing "\r" never leaks into heading or paragraph text.
 *
 * @param markdown - Markdown source to convert.
 * @param onProgress - Optional progress callback; receives 10, 60, 80 and 100.
 * @returns The packed document as a Blob.
 */
async function createDOCXFromMarkdown(markdown: string, onProgress?: ProgressCallback): Promise<Blob> {
  onProgress?.(10);

  // Each marker ends in a space, so the prefixes are mutually exclusive
  // ("# " can never match a line that starts with "## ").
  const headingStyles = [
    { prefix: '# ', heading: HeadingLevel.HEADING_1, spacing: { before: 240, after: 120 } },
    { prefix: '## ', heading: HeadingLevel.HEADING_2, spacing: { before: 200, after: 100 } },
    { prefix: '### ', heading: HeadingLevel.HEADING_3, spacing: { before: 160, after: 80 } },
  ];

  const children: Paragraph[] = [];
  let pendingLines: string[] = [];

  // Emits the buffered body lines (if any) as one paragraph and clears the buffer.
  const flushParagraph = (): void => {
    if (pendingLines.length === 0) return;
    children.push(
      new Paragraph({
        children: [new TextRun(pendingLines.join(' '))],
        spacing: { after: 200 },
      })
    );
    pendingLines = [];
  };

  for (const line of markdown.split(/\r?\n/)) {
    const headingStyle = headingStyles.find(({ prefix }) => line.startsWith(prefix));

    if (headingStyle) {
      // A heading always closes the paragraph that precedes it.
      flushParagraph();
      children.push(
        new Paragraph({
          text: line.substring(headingStyle.prefix.length),
          heading: headingStyle.heading,
          spacing: headingStyle.spacing,
        })
      );
    } else if (line.trim() === '') {
      flushParagraph();
    } else {
      pendingLines.push(line);
    }
  }
  // Text after the last blank line or heading.
  flushParagraph();
  onProgress?.(60);

  const doc = new Document({
    sections: [
      {
        properties: {},
        children,
      },
    ],
  });
  onProgress?.(80);

  const blob = await Packer.toBlob(doc);
  onProgress?.(100);

  return blob;
}
/**
 * Converts a plain-text file into a DOCX blob.
 *
 * Never rejects: failures are logged and reported through the returned result.
 *
 * @param file - Text file to convert.
 * @param onProgress - Optional progress callback. Receives 10 and 20 while the
 *   file is read; document-building progress is then mapped onto 20-100.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is in milliseconds.
 */
export async function textToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    onProgress?.(10);
    const text = await file.text();
    onProgress?.(20);

    // Reading the file used the first 20%; the builder fills the remaining 80%.
    const reportBuild = (progress: number): void => {
      if (onProgress) onProgress(20 + progress * 0.8);
    };
    const blob = await createDOCXFromText(text, reportBuild);

    return { success: true, blob, duration: Date.now() - startedAt };
  } catch (error) {
    console.error('[DOCX Converter] Text to DOCX error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert text to DOCX';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
/**
 * Converts a Markdown file into a DOCX blob.
 *
 * Never rejects: failures are logged and reported through the returned result.
 *
 * @param file - Markdown file to convert.
 * @param onProgress - Optional progress callback. Receives 10 and 20 while the
 *   file is read; document-building progress is then mapped onto 20-100.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is in milliseconds.
 */
export async function markdownToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    onProgress?.(10);
    const markdown = await file.text();
    onProgress?.(20);

    // Reading the file used the first 20%; the builder fills the remaining 80%.
    const reportBuild = (progress: number): void => {
      if (onProgress) onProgress(20 + progress * 0.8);
    };
    const blob = await createDOCXFromMarkdown(markdown, reportBuild);

    return { success: true, blob, duration: Date.now() - startedAt };
  } catch (error) {
    console.error('[DOCX Converter] Markdown to DOCX error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert Markdown to DOCX';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}
/**
 * Reduces an HTML string to plain text whose paragraphs are separated by a
 * blank line ("\n\n").
 *
 * Steps: drop <script>/<style> elements, collapse source whitespace, turn
 * <br> and the closing tags of common block elements into paragraph breaks,
 * replace every remaining tag with a space, then decode a small set of
 * character references in each paragraph.
 */
function htmlToParagraphText(html: string): string {
  // <br> plus the end of common block-level elements marks a paragraph boundary.
  const blockBoundary =
    /<br\s*\/?>|<\/(?:p|div|h[1-6]|li|tr|blockquote|pre|section|article|header|footer|table|ul|ol|dt|dd)\s*>/gi;

  const decodeEntities = (fragment: string): string =>
    fragment
      .replace(/&nbsp;/g, ' ')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      // Numeric references such as &#39; or &#x27;. Out-of-range values are
      // left untouched because String.fromCodePoint would throw on them.
      .replace(/&#(?:x([0-9a-f]+)|(\d+));/gi, (entity: string, hex?: string, dec?: string) => {
        const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? '', 10);
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
      })
      // "&amp;" goes last so that "&amp;lt;" yields the literal text "&lt;".
      .replace(/&amp;/g, '&');

  return html
    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
    .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
    // Collapse source whitespace first, so that the only newlines left
    // afterwards are the paragraph breaks inserted on the next line.
    .replace(/\s+/g, ' ')
    .replace(blockBoundary, '\n\n')
    .replace(/<[^>]*>/g, ' ')
    .split('\n\n')
    .map((paragraph) => decodeEntities(paragraph).replace(/\s+/g, ' ').trim())
    .filter((paragraph) => paragraph.length > 0)
    .join('\n\n');
}

/**
 * Converts an HTML file into a DOCX blob containing its text content.
 *
 * Markup is discarded, but block-level structure is kept: the text of each
 * block element (and each <br>-separated line) is handed to the document
 * builder as its own paragraph instead of one run-on paragraph. Never rejects:
 * failures are logged and reported through the returned result.
 *
 * @param file - HTML file to convert.
 * @param onProgress - Optional progress callback. Receives 10, 20 and 50 while
 *   the file is read and stripped; document-building progress is then mapped
 *   onto 50-100.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is in milliseconds.
 */
export async function htmlToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    onProgress?.(10);
    const html = await file.text();
    onProgress?.(20);

    const text = htmlToParagraphText(html);
    onProgress?.(50);

    const blob = await createDOCXFromText(text, (progress) => {
      if (onProgress) onProgress(50 + progress * 0.5);
    });

    return { success: true, blob, duration: Date.now() - startedAt };
  } catch (error) {
    console.error('[DOCX Converter] HTML to DOCX error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert HTML to DOCX';
    return { success: false, error: message, duration: Date.now() - startedAt };
  }
}