feat: add comprehensive DOCX document support
- Install docx (v9.5.1) and mammoth (v1.11.0) packages - Create docxService.ts with full DOCX read/write functionality: - Extract text, HTML, and Markdown from DOCX files using mammoth - Generate DOCX files from Markdown with proper heading levels (H1-H3) - Generate DOCX files from HTML and plain text - Automatic paragraph formatting and spacing - Integrate DOCX conversions into pandocService.ts - Update README with DOCX support documentation - Add DOCX libraries to tech stack section Supported DOCX conversions: - DOCX → Text/HTML/Markdown - Markdown/HTML/Text → DOCX 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
443
lib/converters/docxService.ts
Normal file
443
lib/converters/docxService.ts
Normal file
@@ -0,0 +1,443 @@
|
||||
import { Document, Packer, Paragraph, TextRun, HeadingLevel } from 'docx';
|
||||
import type { ConversionOptions, ProgressCallback, ConversionResult } from '@/types/conversion';
|
||||
|
||||
/**
 * Reads a DOCX file and returns its raw text content.
 *
 * mammoth is loaded lazily through a dynamic import, so the library is only
 * fetched when a conversion actually runs.
 *
 * @param file - The DOCX file to read.
 * @param onProgress - Optional callback; invoked with 10, 30, 50 and 100 as
 *   the individual steps complete.
 * @returns The `value` field of mammoth's `extractRawText` result.
 */
export async function extractTextFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  report(10);

  // Lazy-load the parser.
  const mammoth = await import('mammoth');
  report(30);

  // mammoth consumes the document as an ArrayBuffer.
  const arrayBuffer = await file.arrayBuffer();
  report(50);

  const { value } = await mammoth.extractRawText({ arrayBuffer });
  report(100);

  return value;
}
|
||||
|
||||
/**
 * Reads a DOCX file and returns its content converted to an HTML fragment.
 *
 * mammoth is loaded lazily through a dynamic import, so the library is only
 * fetched when a conversion actually runs.
 *
 * @param file - The DOCX file to read.
 * @param onProgress - Optional callback; invoked with 10, 30, 50 and 100 as
 *   the individual steps complete.
 * @returns The `value` field of mammoth's `convertToHtml` result.
 */
export async function extractHTMLFromDOCX(file: File, onProgress?: ProgressCallback): Promise<string> {
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  report(10);

  // Lazy-load the parser.
  const mammoth = await import('mammoth');
  report(30);

  // mammoth consumes the document as an ArrayBuffer.
  const arrayBuffer = await file.arrayBuffer();
  report(50);

  const { value } = await mammoth.convertToHtml({ arrayBuffer });
  report(100);

  return value;
}
|
||||
|
||||
/**
 * Converts a DOCX file into a plain-text blob.
 *
 * Failures are never thrown: they are logged and reported through the
 * returned result object.
 *
 * @param file - The DOCX file to convert.
 * @param onProgress - Optional progress callback, passed straight through to
 *   the text extraction step.
 * @returns On success `{ success: true, blob, duration }` with a `text/plain`
 *   blob; on failure `{ success: false, error, duration }`. `duration` is the
 *   elapsed wall-clock time in milliseconds.
 */
export async function docxToText(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();
  const elapsed = (): number => Date.now() - startTime;

  try {
    const extracted = await extractTextFromDOCX(file, onProgress);
    const blob = new Blob([extracted], { type: 'text/plain' });

    return { success: true, blob, duration: elapsed() };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to text error:', error);

    const message = error instanceof Error ? error.message : 'Failed to extract text from DOCX';
    return { success: false, error: message, duration: elapsed() };
  }
}
|
||||
|
||||
/**
 * Converts a DOCX file into a standalone HTML document blob.
 *
 * The HTML fragment produced by the extraction step is embedded in a fixed
 * page skeleton (doctype, meta tags, title and a small stylesheet). Failures
 * are never thrown: they are logged and reported through the result object.
 *
 * @param file - The DOCX file to convert.
 * @param onProgress - Optional progress callback. Extraction progress is
 *   scaled by 0.9; 100 is reported once the page has been assembled.
 * @returns On success `{ success: true, blob, duration }` with a `text/html`
 *   blob; on failure `{ success: false, error, duration }`. `duration` is the
 *   elapsed wall-clock time in milliseconds.
 */
export async function docxToHTML(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();
  const elapsed = (): number => Date.now() - startTime;

  try {
    // Extraction accounts for the first 90% of the reported progress.
    const bodyMarkup = await extractHTMLFromDOCX(file, (progress) => {
      if (onProgress) onProgress(progress * 0.9);
    });

    // Embed the fragment in a complete page.
    const page = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 2rem auto;
padding: 0 1rem;
color: #333;
}
</style>
</head>
<body>
${bodyMarkup}
</body>
</html>`;

    if (onProgress) onProgress(100);

    const blob = new Blob([page], { type: 'text/html' });
    return { success: true, blob, duration: elapsed() };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to HTML error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert DOCX to HTML';
    return { success: false, error: message, duration: elapsed() };
  }
}
|
||||
|
||||
/**
 * Converts a DOCX file into a Markdown blob.
 *
 * The document is first converted to HTML, then the HTML is translated to
 * Markdown with turndown (ATX headings, fenced code blocks, `-` bullets).
 * Failures are never thrown: they are logged and reported through the result
 * object.
 *
 * @param file - The DOCX file to convert.
 * @param onProgress - Optional progress callback. Extraction progress is
 *   scaled by 0.7, then 80 and 100 are reported around the Markdown step.
 * @returns On success `{ success: true, blob, duration }` with a
 *   `text/markdown` blob; on failure `{ success: false, error, duration }`.
 *   `duration` is the elapsed wall-clock time in milliseconds.
 */
export async function docxToMarkdown(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();
  const elapsed = (): number => Date.now() - startTime;
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  try {
    // Stage 1: DOCX -> HTML, mapped onto the first 70% of the progress range.
    const intermediateHtml = await extractHTMLFromDOCX(file, (progress) => {
      report(progress * 0.7);
    });
    report(80);

    // Stage 2: HTML -> Markdown; turndown is loaded lazily.
    const { default: TurndownService } = await import('turndown');
    const converter = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    });
    const markdown = converter.turndown(intermediateHtml);
    report(100);

    const blob = new Blob([markdown], { type: 'text/markdown' });
    return { success: true, blob, duration: elapsed() };
  } catch (error) {
    console.error('[DOCX Converter] DOCX to Markdown error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert DOCX to Markdown';
    return { success: false, error: message, duration: elapsed() };
  }
}
|
||||
|
||||
/**
 * Splits plain text into trimmed, non-empty paragraphs.
 *
 * Line endings are normalised first (CRLF and lone CR become LF) so text
 * written on Windows splits the same way as Unix text. A paragraph break is
 * any blank line, including one that contains only whitespace.
 *
 * @param text - The raw text to split.
 * @returns The paragraphs in document order; empty when `text` is blank.
 */
function splitTextIntoParagraphs(text: string): string[] {
  return text
    .replace(/\r\n?/g, '\n')
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);
}

/**
 * Builds a DOCX blob from plain text, one DOCX paragraph per text paragraph.
 *
 * @param text - The plain text to convert; paragraphs are separated by blank
 *   lines.
 * @param onProgress - Optional callback; invoked with 20, 40, 70 and 100 as
 *   the individual steps complete.
 * @returns The packed DOCX document.
 */
async function createDOCXFromText(text: string, onProgress?: ProgressCallback): Promise<Blob> {
  if (onProgress) onProgress(20);

  const paragraphs = splitTextIntoParagraphs(text);

  if (onProgress) onProgress(40);

  // A single section holding every paragraph, each followed by a small gap.
  const doc = new Document({
    sections: [
      {
        properties: {},
        children: paragraphs.map(
          (para) =>
            new Paragraph({
              children: [new TextRun(para)],
              spacing: {
                after: 200,
              },
            })
        ),
      },
    ],
  });

  if (onProgress) onProgress(70);

  // Serialise the document to a DOCX blob.
  const blob = await Packer.toBlob(doc);

  if (onProgress) onProgress(100);

  return blob;
}
|
||||
|
||||
/** A block recognised by `parseMarkdownBlocks`. */
type MarkdownBlock =
  | { kind: 'heading'; level: number; text: string }
  | { kind: 'paragraph'; text: string };

/**
 * Splits Markdown into headings and paragraphs.
 *
 * Only ATX headings (`#` to `######` followed by whitespace) are recognised.
 * Every other non-blank line is body text; consecutive body lines are joined
 * with a single space, and a blank line or a heading ends the current
 * paragraph. LF, CRLF and lone CR line endings are all accepted.
 *
 * @param markdown - The Markdown source.
 * @returns The blocks in document order; empty when the input is blank.
 */
function parseMarkdownBlocks(markdown: string): MarkdownBlock[] {
  const blocks: MarkdownBlock[] = [];
  let pendingLines: string[] = [];

  // Emits the paragraph collected so far, if there is one.
  const flushParagraph = (): void => {
    if (pendingLines.length > 0) {
      blocks.push({ kind: 'paragraph', text: pendingLines.join(' ') });
      pendingLines = [];
    }
  };

  for (const line of markdown.split(/\r\n|\r|\n/)) {
    const headingMatch = /^(#{1,6})[ \t]+(.*)$/.exec(line);

    if (headingMatch) {
      const [, hashes = '', text = ''] = headingMatch;
      flushParagraph();
      blocks.push({ kind: 'heading', level: hashes.length, text: text.trim() });
    } else if (line.trim() === '') {
      flushParagraph();
    } else {
      pendingLines.push(line);
    }
  }

  flushParagraph();
  return blocks;
}

/**
 * Builds a DOCX blob from Markdown.
 *
 * Headings of level 1 to 6 become DOCX headings of the same level; every
 * other block becomes a normal paragraph. Inline Markdown such as emphasis,
 * links or code is not interpreted and is emitted as literal text.
 *
 * @param markdown - The Markdown source.
 * @param onProgress - Optional callback; invoked with 10, 60, 80 and 100 as
 *   the individual steps complete.
 * @returns The packed DOCX document.
 */
async function createDOCXFromMarkdown(markdown: string, onProgress?: ProgressCallback): Promise<Blob> {
  if (onProgress) onProgress(10);

  // Indexed by heading level - 1. Built inside the function so the docx
  // constants are only read when a conversion actually runs.
  const headingStyles = [
    { heading: HeadingLevel.HEADING_1, spacing: { before: 240, after: 120 } },
    { heading: HeadingLevel.HEADING_2, spacing: { before: 200, after: 100 } },
    { heading: HeadingLevel.HEADING_3, spacing: { before: 160, after: 80 } },
    { heading: HeadingLevel.HEADING_4, spacing: { before: 120, after: 60 } },
    { heading: HeadingLevel.HEADING_5, spacing: { before: 120, after: 60 } },
    { heading: HeadingLevel.HEADING_6, spacing: { before: 120, after: 60 } },
  ];

  const children: Paragraph[] = parseMarkdownBlocks(markdown).map((block) => {
    const style = block.kind === 'heading' ? headingStyles[block.level - 1] : undefined;

    if (style !== undefined) {
      return new Paragraph({
        text: block.text,
        heading: style.heading,
        spacing: style.spacing,
      });
    }

    return new Paragraph({
      children: [new TextRun(block.text)],
      spacing: { after: 200 },
    });
  });

  if (onProgress) onProgress(60);

  const doc = new Document({
    sections: [
      {
        properties: {},
        children,
      },
    ],
  });

  if (onProgress) onProgress(80);

  const blob = await Packer.toBlob(doc);

  if (onProgress) onProgress(100);

  return blob;
}
|
||||
|
||||
/**
 * Converts a plain-text file into a DOCX blob.
 *
 * Failures are never thrown: they are logged and reported through the
 * returned result object.
 *
 * @param file - The text file to convert.
 * @param onProgress - Optional progress callback. 10 and 20 are reported
 *   while the file is read; document generation progress `p` is then
 *   reported as `20 + p * 0.8`.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is the elapsed
 *   wall-clock time in milliseconds.
 */
export async function textToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();
  const elapsed = (): number => Date.now() - startTime;
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  try {
    report(10);
    const contents = await file.text();
    report(20);

    const blob = await createDOCXFromText(contents, (progress) => {
      report(20 + progress * 0.8);
    });

    return { success: true, blob, duration: elapsed() };
  } catch (error) {
    console.error('[DOCX Converter] Text to DOCX error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert text to DOCX';
    return { success: false, error: message, duration: elapsed() };
  }
}
|
||||
|
||||
/**
 * Converts a Markdown file into a DOCX blob.
 *
 * Failures are never thrown: they are logged and reported through the
 * returned result object.
 *
 * @param file - The Markdown file to convert.
 * @param onProgress - Optional progress callback. 10 and 20 are reported
 *   while the file is read; document generation progress `p` is then
 *   reported as `20 + p * 0.8`.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is the elapsed
 *   wall-clock time in milliseconds.
 */
export async function markdownToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();
  const elapsed = (): number => Date.now() - startTime;
  const report = (percent: number): void => {
    if (onProgress) onProgress(percent);
  };

  try {
    report(10);
    const source = await file.text();
    report(20);

    const blob = await createDOCXFromMarkdown(source, (progress) => {
      report(20 + progress * 0.8);
    });

    return { success: true, blob, duration: elapsed() };
  } catch (error) {
    console.error('[DOCX Converter] Markdown to DOCX error:', error);

    const message = error instanceof Error ? error.message : 'Failed to convert Markdown to DOCX';
    return { success: false, error: message, duration: elapsed() };
  }
}
|
||||
|
||||
/** Replacement text for the character references decoded by `htmlToPlainText`. */
const HTML_ENTITY_TEXT: Record<string, string> = {
  nbsp: ' ',
  quot: '"',
  apos: "'",
  '#39': "'",
  lt: '<',
  gt: '>',
  amp: '&',
};

/**
 * Reduces an HTML document to plain text while keeping its paragraph
 * structure.
 *
 * Steps, in order:
 * 1. `<script>` and `<style>` elements are removed together with their
 *    content.
 * 2. Source whitespace is collapsed, because line breaks in HTML source
 *    carry no meaning.
 * 3. Closing tags of block-level elements, and `<br>` / `<hr>`, become
 *    paragraph breaks.
 * 4. All remaining tags are replaced by a space.
 * 5. A small set of character references is decoded in a single pass, so
 *    `&amp;lt;` yields the literal text `&lt;` rather than `<`.
 *
 * @param html - The HTML source.
 * @returns The text with paragraphs separated by one blank line (`\n\n`);
 *   empty when the document contains no text.
 */
function htmlToPlainText(html: string): string {
  const text = html
    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
    .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
    .replace(/\s+/g, ' ')
    .replace(
      /<\/(?:p|div|h[1-6]|li|tr|blockquote|pre|section|article|table|ul|ol)\s*>|<(?:br|hr)\b[^>]*>/gi,
      '\n\n'
    )
    // Tags are stripped before decoding so that a decoded "<" is kept as text.
    .replace(/<[^>]*>/g, ' ')
    .replace(/&(nbsp|quot|apos|#39|lt|gt|amp);/g, (match: string, name: string) => {
      const replacement = HTML_ENTITY_TEXT[name];
      return replacement === undefined ? match : replacement;
    });

  return text
    .split(/\n{2,}/)
    .map((paragraph) => paragraph.replace(/\s+/g, ' ').trim())
    .filter((paragraph) => paragraph.length > 0)
    .join('\n\n');
}

/**
 * Converts an HTML file into a DOCX blob.
 *
 * Markup is discarded: only the text is kept, one DOCX paragraph per
 * block-level HTML element. Failures are never thrown: they are logged and
 * reported through the returned result object.
 *
 * @param file - The HTML file to convert.
 * @param onProgress - Optional progress callback. 10, 20 and 50 are reported
 *   while the file is read and reduced to text; document generation progress
 *   `p` is then reported as `50 + p * 0.5`.
 * @returns On success `{ success: true, blob, duration }`; on failure
 *   `{ success: false, error, duration }`. `duration` is the elapsed
 *   wall-clock time in milliseconds.
 */
export async function htmlToDOCX(
  file: File,
  onProgress?: ProgressCallback
): Promise<ConversionResult> {
  const startTime = Date.now();

  try {
    if (onProgress) onProgress(10);

    const html = await file.text();

    if (onProgress) onProgress(20);

    // Paragraphs come back separated by blank lines, which is the format
    // createDOCXFromText splits on.
    const text = htmlToPlainText(html);

    if (onProgress) onProgress(50);

    const blob = await createDOCXFromText(text, (progress) => {
      if (onProgress) onProgress(50 + progress * 0.5);
    });

    return {
      success: true,
      blob,
      duration: Date.now() - startTime,
    };
  } catch (error) {
    console.error('[DOCX Converter] HTML to DOCX error:', error);

    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to convert HTML to DOCX',
      duration: Date.now() - startTime,
    };
  }
}
|
||||
@@ -8,6 +8,14 @@ import {
|
||||
htmlToPDF,
|
||||
plainTextToPDF,
|
||||
} from './pdfService';
|
||||
import {
|
||||
docxToText,
|
||||
docxToHTML,
|
||||
docxToMarkdown,
|
||||
textToDOCX,
|
||||
markdownToDOCX,
|
||||
htmlToDOCX,
|
||||
} from './docxService';
|
||||
|
||||
// Import DOMPurify only on client side
|
||||
let DOMPurify: any;
|
||||
@@ -53,6 +61,20 @@ export async function convertWithPandoc(
|
||||
}
|
||||
}
|
||||
|
||||
// Handle DOCX conversions
|
||||
if (inputExt === 'docx') {
|
||||
// DOCX input
|
||||
if (outputFormat === 'txt') {
|
||||
return await docxToText(file, onProgress);
|
||||
} else if (outputFormat === 'html') {
|
||||
return await docxToHTML(file, onProgress);
|
||||
} else if (outputFormat === 'md' || outputFormat === 'markdown') {
|
||||
return await docxToMarkdown(file, onProgress);
|
||||
} else {
|
||||
throw new Error(`Conversion from DOCX to ${outputFormat} not supported`);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle conversions TO PDF
|
||||
if (outputFormat === 'pdf') {
|
||||
if (inputExt === 'md' || inputExt === 'markdown') {
|
||||
@@ -66,6 +88,19 @@ export async function convertWithPandoc(
|
||||
}
|
||||
}
|
||||
|
||||
// Handle conversions TO DOCX
|
||||
if (outputFormat === 'docx') {
|
||||
if (inputExt === 'md' || inputExt === 'markdown') {
|
||||
return await markdownToDOCX(file, onProgress);
|
||||
} else if (inputExt === 'html' || inputExt === 'htm') {
|
||||
return await htmlToDOCX(file, onProgress);
|
||||
} else if (inputExt === 'txt') {
|
||||
return await textToDOCX(file, onProgress);
|
||||
} else {
|
||||
throw new Error(`Conversion from ${inputExt} to DOCX not supported`);
|
||||
}
|
||||
}
|
||||
|
||||
// Perform conversion based on input and output formats
|
||||
if (inputExt === 'md' || inputExt === 'markdown') {
|
||||
// Markdown input
|
||||
|
||||
Reference in New Issue
Block a user