<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <title>PDF to Markdown Converter (Streaming)</title>
  <style>
    /* Page chrome ------------------------------------------------------- */
    body {
      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
      margin: 0;
      background-color: #f0f2f5;
      color: #1c1e21;
      line-height: 1.5;
    }
    .navbar {
      background-color: #1877f2;
      padding: 10px 20px;
      color: white;
      text-align: center;
    }
    .navbar h1 {
      margin: 0;
      font-size: 1.8em;
    }
    .container {
      max-width: 800px;
      margin: 20px auto;
      background-color: #fff;
      padding: 25px;
      border-radius: 8px;
      box-shadow: 0 2px 4px rgba(0,0,0,0.1), 0 8px 16px rgba(0,0,0,0.1);
    }
    p {
      margin-bottom: 1em;
    }

    /* Form controls ----------------------------------------------------- */
    label {
      display: block;
      margin-top: 15px;
      margin-bottom: 5px;
      font-weight: 600;
      color: #4b4f56;
    }
    input[type="file"],
    input[type="text"] {
      /* 22px = 2 x 10px padding + 2 x 1px border (content-box sizing). */
      width: calc(100% - 22px);
      padding: 10px;
      margin-top: 5px;
      border: 1px solid #dddfe2;
      border-radius: 6px;
      font-size: 1em;
    }
    input[type="file"] {
      padding: 7px;
    }
    #submitBtn {
      background-color: #1877f2;
      color: white;
      padding: 10px 20px;
      border: none;
      border-radius: 6px;
      cursor: pointer;
      margin-top: 25px;
      font-size: 1.1em;
      font-weight: bold;
    }
    #submitBtn:hover {
      background-color: #166fe5;
    }
    #submitBtn:disabled {
      background-color: #a0a0a0;
      cursor: not-allowed;
    }

    /* Messages and output ----------------------------------------------- */
    .message {
      margin-top: 20px;
      padding: 12px;
      border-radius: 6px;
      font-size: 0.95em;
    }
    .error {
      background-color: #f8d7da;
      border: 1px solid #f5c2c7;
      color: #842029;
    }
    #statusArea {
      background-color: #e7f3ff;
      border: 1px solid #cfe2ff;
      color: #055160;
      margin-top: 20px;
      padding: 10px;
      min-height: 50px;
      /* Cap the height and let the box scroll; the page script sets
         scrollTop = scrollHeight after each status line, which only has an
         effect when this element is itself scrollable. */
      max-height: 300px;
      overflow-y: auto;
      border-radius: 6px;
    }
    #statusArea p {
      margin: 5px 0;
    }
    #markdownOutput {
      background-color: #f5f6f7;
      padding: 15px;
      border: 1px solid #e0e0e0;
      border-radius: 6px;
      white-space: pre-wrap;
      word-wrap: break-word;
      margin-top: 20px;
      font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
      font-size: 0.9em;
      line-height: 1.6;
      min-height: 100px;
    }

    /* Layout helpers ---------------------------------------------------- */
    .or-separator {
      text-align: center;
      margin: 20px 0;
      font-weight: bold;
      color: #606770;
    }
    .form-actions {
      text-align: center;
    }
    .footer {
      text-align: center;
      margin-top: 30px;
      font-size: 0.85em;
      color: #606770;
    }
  </style>
</head>
<body>
<!-- Page banner; the single h1 of the document. -->
<header class="navbar">
  <h1>PDF to Markdown Converter (Streaming)</h1>
</header>
<!-- Primary content: input form, streamed status log and Markdown output. -->
<main class="container">
  <p>Upload a PDF file or provide a URL to convert it to Markdown. Progress will be streamed.</p>
  <!-- Hidden until the page script reports a failure; the script toggles style.display. -->
  <div id="globalError" class="message error" role="alert" style="display:none;"></div>
  <form id="pdfForm">
    <div>
      <label for="pdf_file">Upload PDF File:</label>
      <input type="file" name="pdf_file" id="pdf_file" accept=".pdf,application/pdf">
    </div>
    <div class="or-separator">OR</div>
    <div>
      <label for="pdf_url">Enter PDF URL:</label>
      <input type="text" name="pdf_url" id="pdf_url" placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf">
    </div>
    <div class="form-actions">
      <!-- type="button": the request is sent by the page script, not by native form submission. -->
      <button type="button" id="submitBtn">Convert to Markdown</button>
    </div>
  </form>
  <h2>Processing Status:</h2>
  <!-- role="log": status lines are appended over time and should be announced politely. -->
  <div id="statusArea" role="log">
    <p>Waiting for input...</p>
  </div>
  <h2>Markdown Output:</h2>
  <pre id="markdownOutput">Output will appear here...</pre>
</main>
<!-- Page footer with attribution. -->
<footer class="footer">
  <p>Powered by Flask, Poppler, Tesseract, and Hugging Face.</p>
</footer>
<script>
  // Client-side driver for the streaming conversion.
  //
  // Posts the form (file and/or URL) to the streaming endpoint and consumes
  // the response as newline-delimited JSON. Each line is one event object with
  // a "type" field; the types handled below are: status, markdown_chunk,
  // markdown_replace, image_md, error and final_status.
  const form = document.getElementById('pdfForm');
  const submitBtn = document.getElementById('submitBtn');
  const statusArea = document.getElementById('statusArea');
  const markdownOutput = document.getElementById('markdownOutput');
  const globalError = document.getElementById('globalError');

  /**
   * Append one paragraph to the status area and scroll it into view.
   *
   * The text is always assigned through textContent, never innerHTML, so
   * server-supplied messages cannot inject markup into the page.
   *
   * @param {string} text - Message to display.
   * @param {{strong?: boolean, color?: string}} [options] - strong wraps the
   *   text in a <strong> element; color sets an inline text colour.
   */
  function appendStatus(text, options) {
    const settings = options || {};
    const paragraph = document.createElement('p');
    if (settings.color) {
      paragraph.style.color = settings.color;
    }
    if (settings.strong) {
      const emphasis = document.createElement('strong');
      emphasis.textContent = text;
      paragraph.appendChild(emphasis);
    } else {
      paragraph.textContent = text;
    }
    statusArea.appendChild(paragraph);
    statusArea.scrollTop = statusArea.scrollHeight; // Auto-scroll
  }

  /**
   * Parse one complete line of the stream and apply it to the page.
   *
   * Blank lines are ignored. Lines that are not valid JSON objects are logged
   * with console.warn and skipped, so one bad line does not abort the stream.
   *
   * @param {string} line - A single line with no embedded newline.
   */
  function handleStreamLine(line) {
    if (line.trim() === '') return;

    let data;
    try {
      data = JSON.parse(line);
    } catch (e) {
      console.warn('Failed to parse JSON chunk:', line, e);
      return;
    }
    if (data === null || typeof data !== 'object') {
      console.warn('Failed to parse JSON chunk:', line);
      return;
    }

    switch (data.type) {
      case 'status':
        appendStatus(data.message);
        break;
      case 'markdown_chunk':
      case 'image_md':
        // Append a text node rather than `textContent +=`, which would
        // re-serialise and replace the whole output on every chunk.
        markdownOutput.appendChild(document.createTextNode(data.content));
        break;
      case 'markdown_replace':
        markdownOutput.textContent = data.content; // For initial title or full rewrite
        break;
      case 'error':
        appendStatus('ERROR: ' + data.message, { color: 'red' });
        globalError.textContent = 'An error occurred: ' + data.message;
        globalError.style.display = 'block';
        break;
      case 'final_status':
        appendStatus(data.message, { strong: true });
        break;
      default:
        // Unknown event types are ignored.
        break;
    }
  }

  /**
   * Run one conversion: send the form, stream the response, update the page.
   *
   * The submit button is disabled for the duration and re-enabled in all
   * cases. A call made while a conversion is already running is a no-op.
   */
  async function runConversion() {
    if (submitBtn.disabled) return;

    submitBtn.disabled = true;
    statusArea.innerHTML = '<p>Starting processing...</p>';
    markdownOutput.textContent = 'Processing...';
    globalError.style.display = 'none';

    const formData = new FormData(form);

    try {
      // The URL below is a server-side template expression (url_for) and is
      // replaced with the real endpoint path when the page is rendered.
      const response = await fetch("{{ url_for('process_pdf_stream') }}", {
        method: 'POST',
        body: formData,
      });

      if (!response.ok) {
        // Handle initial HTTP errors before streaming starts (e.g., 400, 500 from Flask before yield)
        const errorText = await response.text();
        throw new Error(`Server error: ${response.status} ${response.statusText}. ${errorText}`);
      }

      // Process the streamed response
      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      markdownOutput.textContent = ''; // Clear previous output

      // Network chunks do not align with line boundaries, so keep the
      // trailing partial line here until the rest of it arrives.
      let pending = '';

      while (true) {
        const { value, done } = await reader.read();
        if (done) break;

        pending += decoder.decode(value, { stream: true });
        const lines = pending.split('\n');
        pending = lines.pop(); // Last piece is incomplete (or '' after a newline)
        lines.forEach(handleStreamLine);
      }

      // Flush bytes still held by the decoder and handle a final line that
      // was not terminated by a newline.
      pending += decoder.decode();
      handleStreamLine(pending);

      appendStatus('Processing complete.', { strong: true });
    } catch (error) {
      console.error('Fetch error:', error);

      // Replace the status log with the failure notice. Built from DOM nodes
      // because error.message can contain the server's response body.
      statusArea.textContent = '';
      const failure = document.createElement('p');
      failure.style.color = 'red';
      const failureLabel = document.createElement('strong');
      failureLabel.textContent = 'Processing failed:';
      failure.appendChild(failureLabel);
      failure.appendChild(document.createTextNode(' ' + error.message));
      statusArea.appendChild(failure);

      markdownOutput.textContent = 'Error occurred.';
      globalError.textContent = `An error occurred during the request: ${error.message}`;
      globalError.style.display = 'block';
    } finally {
      submitBtn.disabled = false;
    }
  }

  submitBtn.addEventListener('click', function (event) {
    event.preventDefault();
    runConversion();
  });

  // Pressing Enter in the URL field triggers native form submission, which
  // would reload the page; intercept it and start the conversion instead.
  form.addEventListener('submit', function (event) {
    event.preventDefault();
    runConversion();
  });
</script>
</body>
</html>