- Install docx (v9.5.1) and mammoth (v1.11.0) packages
- Create docxService.ts with full DOCX read/write functionality:
  - Extract text, HTML, and Markdown from DOCX files using mammoth
  - Generate DOCX files from Markdown with proper heading levels (H1-H3)
  - Generate DOCX files from HTML and plain text
  - Automatic paragraph formatting and spacing
- Integrate DOCX conversions into pandocService.ts
- Update README with DOCX support documentation
- Add DOCX libraries to tech stack section

Supported DOCX conversions:
- DOCX → Text/HTML/Markdown
- Markdown/HTML/Text → DOCX

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
444 lines
10 KiB
TypeScript
444 lines
10 KiB
TypeScript
import { Document, Packer, Paragraph, TextRun, HeadingLevel } from 'docx';
|
|
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
|
|
|
/**
 * Read a DOCX file and return its raw text via mammoth's `extractRawText`.
 *
 * mammoth is pulled in with a dynamic `import()` at call time rather than at
 * module load (the original note marks it as client-side only).
 *
 * @param file - DOCX file whose bytes are read with `file.arrayBuffer()`.
 * @param onProgress - Optional callback; invoked with 10, 30, 50 and 100 as
 *   the individual steps finish.
 * @returns The `value` field of mammoth's extraction result.
 */
export async function extractTextFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  report(10);

  // Lazy-load the parser so it is only fetched when a DOCX is actually read.
  const mammothLib = await import('mammoth');
  report(30);

  // mammoth consumes the document as an in-memory ArrayBuffer.
  const buffer = await file.arrayBuffer();
  report(50);

  const extraction = await mammothLib.extractRawText({ arrayBuffer: buffer });
  report(100);

  return extraction.value;
}
|
|
|
|
/**
 * Read a DOCX file and return it as an HTML fragment via mammoth's
 * `convertToHtml`.
 *
 * mammoth is pulled in with a dynamic `import()` at call time rather than at
 * module load (the original note marks it as client-side only).
 *
 * @param file - DOCX file whose bytes are read with `file.arrayBuffer()`.
 * @param onProgress - Optional callback; invoked with 10, 30, 50 and 100 as
 *   the individual steps finish.
 * @returns The `value` field of mammoth's conversion result.
 */
export async function extractHTMLFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  report(10);

  // Lazy-load the parser so it is only fetched when a DOCX is actually read.
  const mammothLib = await import('mammoth');
  report(30);

  // mammoth consumes the document as an in-memory ArrayBuffer.
  const buffer = await file.arrayBuffer();
  report(50);

  const conversion = await mammothLib.convertToHtml({ arrayBuffer: buffer });
  report(100);

  return conversion.value;
}
|
|
|
|
/**
 * Convert a DOCX file into a `text/plain` blob.
 *
 * Failures are not thrown: they are logged to the console and reported through
 * the returned result object instead.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional callback, forwarded unchanged to
 *   `extractTextFromDOCX`.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is elapsed milliseconds.
 */
export async function docxToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    const rawText = await extractTextFromDOCX(file, onProgress);

    return {
      success: true,
      blob: new Blob([rawText], { type: 'text/plain' }),
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to text error:', error);

    // Non-Error throwables carry no message, so fall back to a fixed one.
    const message = error instanceof Error ? error.message : 'Failed to extract text from DOCX';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
|
|
|
|
/**
 * Convert a DOCX file into a standalone `text/html` document blob.
 *
 * The HTML fragment produced by `extractHTMLFromDOCX` is embedded in a fixed
 * page shell (doctype, head with a small stylesheet, body). Failures are not
 * thrown: they are logged and reported through the returned result object.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional callback. Extraction progress is scaled by 0.9;
 *   100 is reported once the page has been assembled.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is elapsed milliseconds.
 */
export async function docxToHTML(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Extraction covers the first 90% of the reported range.
    const html = await extractHTMLFromDOCX(file, (percent) => {
      if (onProgress) onProgress(percent * 0.9);
    });

    // Page shell around the extracted fragment.
    const page = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${html}
</body>
</html>`;

    if (onProgress) onProgress(100);

    return {
      success: true,
      blob: new Blob([page], { type: 'text/html' }),
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to HTML error:', error);

    // Non-Error throwables carry no message, so fall back to a fixed one.
    const message = error instanceof Error ? error.message : 'Failed to convert DOCX to HTML';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
|
|
|
|
/**
 * Convert a DOCX file into a `text/markdown` blob.
 *
 * The document is first turned into HTML with `extractHTMLFromDOCX`, then the
 * HTML is converted to Markdown with turndown (ATX headings, fenced code
 * blocks, `-` bullets). Failures are not thrown: they are logged and reported
 * through the returned result object.
 *
 * @param file - DOCX file to convert.
 * @param onProgress - Optional callback. Extraction progress is scaled by 0.7,
 *   then 80 is reported before the Markdown step and 100 after it.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is elapsed milliseconds.
 */
export async function docxToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();

  try {
    // Stage 1: DOCX -> HTML, mapped onto the first 70% of the range.
    const html = await extractHTMLFromDOCX(file, (percent) => {
      if (onProgress) onProgress(percent * 0.7);
    });

    if (onProgress) onProgress(80);

    // Stage 2: HTML -> Markdown; turndown is loaded on demand.
    const turndownModule = await import('turndown');
    const TurndownService = turndownModule.default;
    const converter = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    });
    const markdown = converter.turndown(html);

    if (onProgress) onProgress(100);

    return {
      success: true,
      blob: new Blob([markdown], { type: 'text/markdown' }),
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to Markdown error:', error);

    // Non-Error throwables carry no message, so fall back to a fixed one.
    const message = error instanceof Error ? error.message : 'Failed to convert DOCX to Markdown';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
|
|
|
|
/**
 * Build a DOCX blob from plain text, one Word paragraph per text paragraph.
 *
 * Paragraphs are separated by one or more blank lines. Both LF and CRLF line
 * endings are recognised, so text authored on Windows is split the same way
 * as Unix text (a literal `'\n\n'` split never matches `'\r\n\r\n'`, which
 * would put the whole document in a single paragraph).
 *
 * @param text - Source text; whitespace-only paragraphs are dropped and each
 *   remaining paragraph is trimmed.
 * @param onProgress - Optional callback; invoked with 20, 40, 70 and 100.
 * @returns The packed DOCX document as a Blob.
 */
async function createDOCXFromText(text: string, onProgress?: ProgressCallback): Promise<Blob> {
  if (onProgress) onProgress(20);

  // Two or more consecutive line terminators (LF or CRLF) mark a paragraph break.
  const paragraphs = text
    .split(/(?:\r?\n){2,}/)
    .map((para) => para.trim())
    .filter((para) => para !== '');

  if (onProgress) onProgress(40);

  // Single section; every paragraph gets the same trailing spacing.
  const doc = new Document({
    sections: [
      {
        properties: {},
        children: paragraphs.map(
          (para) =>
            new Paragraph({
              children: [new TextRun(para)],
              spacing: {
                after: 200,
              },
            })
        ),
      },
    ],
  });

  if (onProgress) onProgress(70);

  // Serialise the document model into a DOCX blob.
  const blob = await Packer.toBlob(doc);

  if (onProgress) onProgress(100);

  return blob;
}
|
|
|
|
/**
 * Build a DOCX blob from a small subset of Markdown.
 *
 * Recognised syntax:
 * - `# `, `## ` and `### ` at the start of a line become Heading 1-3.
 * - Blank (whitespace-only) lines end the current paragraph.
 * - Any other line is body text; consecutive body lines are joined with a
 *   single space into one paragraph.
 *
 * Everything else (deeper headings, lists, emphasis, code, links) is passed
 * through as literal text. Lines are split on LF or CRLF so that a trailing
 * `\r` never leaks into heading or paragraph text.
 *
 * @param markdown - Markdown source.
 * @param onProgress - Optional callback; invoked with 10, 60, 80 and 100.
 * @returns The packed DOCX document as a Blob.
 */
async function createDOCXFromMarkdown(markdown: string, onProgress?: ProgressCallback): Promise<Blob> {
  if (onProgress) onProgress(10);

  // Heading markers with their Word heading level and spacing. The prefixes
  // include the trailing space, so '# ' cannot match a '## ' line.
  const headingStyles = [
    { prefix: '# ', heading: HeadingLevel.HEADING_1, spacing: { before: 240, after: 120 } },
    { prefix: '## ', heading: HeadingLevel.HEADING_2, spacing: { before: 200, after: 100 } },
    { prefix: '### ', heading: HeadingLevel.HEADING_3, spacing: { before: 160, after: 80 } },
  ];

  const children: Paragraph[] = [];
  let pendingLines: string[] = [];

  // Emit the body lines collected so far as one paragraph, if there are any.
  const flushParagraph = (): void => {
    if (pendingLines.length === 0) return;
    children.push(new Paragraph({
      children: [new TextRun(pendingLines.join(' '))],
      spacing: { after: 200 },
    }));
    pendingLines = [];
  };

  for (const line of markdown.split(/\r?\n/)) {
    const style = headingStyles.find(({ prefix }) => line.startsWith(prefix));

    if (style) {
      // A heading terminates any paragraph in progress.
      flushParagraph();
      children.push(new Paragraph({
        text: line.substring(style.prefix.length),
        heading: style.heading,
        spacing: style.spacing,
      }));
    } else if (line.trim() === '') {
      // Empty line - paragraph break
      flushParagraph();
    } else {
      // Regular text
      pendingLines.push(line);
    }
  }

  // Input that does not end with a blank line still has a paragraph pending.
  flushParagraph();

  if (onProgress) onProgress(60);

  const doc = new Document({
    sections: [
      {
        properties: {},
        children,
      },
    ],
  });

  if (onProgress) onProgress(80);

  const blob = await Packer.toBlob(doc);

  if (onProgress) onProgress(100);

  return blob;
}
|
|
|
|
/**
 * Convert a plain-text file into a DOCX blob.
 *
 * Failures are not thrown: they are logged to the console and reported through
 * the returned result object instead.
 *
 * @param file - Text file; its contents are read with `file.text()`.
 * @param onProgress - Optional callback. Receives 10 and 20 while the file is
 *   read, then `20 + p * 0.8` for each value `p` reported by the DOCX builder.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is elapsed milliseconds.
 */
export async function textToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();
  const notify = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  try {
    notify(10);
    const contents = await file.text();
    notify(20);

    // Rescale the builder's own progress values before forwarding them.
    const docxBlob = await createDOCXFromText(contents, (builderPercent) => {
      notify(20 + builderPercent * 0.8);
    });

    return {
      success: true,
      blob: docxBlob,
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] Text to DOCX error:', error);

    // Non-Error throwables carry no message, so fall back to a fixed one.
    const message = error instanceof Error ? error.message : 'Failed to convert text to DOCX';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
|
|
|
|
/**
 * Convert a Markdown file into a DOCX blob.
 *
 * Failures are not thrown: they are logged to the console and reported through
 * the returned result object instead.
 *
 * @param file - Markdown file; its contents are read with `file.text()`.
 * @param onProgress - Optional callback. Receives 10 and 20 while the file is
 *   read, then `20 + p * 0.8` for each value `p` reported by the DOCX builder.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is elapsed milliseconds.
 */
export async function markdownToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startedAt = Date.now();
  const notify = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  try {
    notify(10);
    const source = await file.text();
    notify(20);

    // Rescale the builder's own progress values before forwarding them.
    const docxBlob = await createDOCXFromMarkdown(source, (builderPercent) => {
      notify(20 + builderPercent * 0.8);
    });

    return {
      success: true,
      blob: docxBlob,
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    console.error('[DOCX Converter] Markdown to DOCX error:', error);

    // Non-Error throwables carry no message, so fall back to a fixed one.
    const message = error instanceof Error ? error.message : 'Failed to convert Markdown to DOCX';
    return {
      success: false,
      error: message,
      duration: Date.now() - startedAt,
    };
  }
}
|
|
|
|
/**
 * Reduce an HTML string to plain text in which paragraphs are separated by a
 * blank line.
 *
 * Steps, in order:
 * 1. Remove `<script>` and `<style>` elements together with their contents.
 * 2. Flatten source whitespace, which carries no meaning in HTML.
 * 3. Turn closing block-level tags into paragraph breaks, then strip every
 *    remaining tag.
 * 4. Decode the common entities. This happens after tag stripping so escaped
 *    markup such as `&lt;b&gt;` survives as text, and `&amp;` is decoded last
 *    so `&amp;lt;` becomes `&lt;` rather than `<`.
 * 5. Tidy spaces and collapse runs of blank lines.
 */
function htmlToParagraphText(html: string): string {
  return html
    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
    .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
    .replace(/\s+/g, ' ')
    .replace(
      /<\/(?:p|div|h[1-6]|li|tr|blockquote|pre|section|article|header|footer|table|ul|ol)\s*>/gi,
      '\n\n'
    )
    .replace(/<[^>]*>/g, ' ')
    .replace(/&nbsp;/g, ' ')
    .replace(/&quot;/g, '"')
    .replace(/&(?:#39|apos);/g, "'")
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&amp;/g, '&')
    .replace(/[ \t]+/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Convert an HTML file into a DOCX blob containing its text content.
 *
 * Only text is carried over; inline formatting, images and tables are not
 * reproduced. Block-level elements become separate Word paragraphs, because
 * the text handed to `createDOCXFromText` keeps a blank line between them.
 * Failures are not thrown: they are logged and reported through the returned
 * result object.
 *
 * @param file - HTML file; its contents are read with `file.text()`.
 * @param onProgress - Optional callback. Receives 10, 20 and 50 up to the end
 *   of text extraction, then `50 + p * 0.5` for each value `p` reported by the
 *   DOCX builder.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is elapsed milliseconds.
 */
export async function htmlToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const html = await file.text();

    if (onProgress) onProgress(20);

    // Strip markup and decode entities, keeping paragraph boundaries.
    const text = htmlToParagraphText(html);

    if (onProgress) onProgress(50);

    // The builder's progress is mapped onto the second half of the range.
    const blob = await createDOCXFromText(text, (progress) => {
      if (onProgress) onProgress(50 + progress * 0.5);
    });

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[DOCX Converter] HTML to DOCX error:', error);

    return {
      success: false,
      // Non-Error throwables carry no message, so fall back to a fixed one.
      error: error instanceof Error ? error.message : 'Failed to convert HTML to DOCX',
      duration: Date.now() - startTime,
    };
  }
}
|