Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,128 +1,413 @@
|
|
1 |
-
from flask import Flask, request, render_template_string, send_file
|
2 |
-
import markdown
|
3 |
-
import imgkit
|
4 |
import os
|
5 |
-
import
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
app = Flask(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
def index():
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
if request.method == "POST" and markdown_text:
|
28 |
try:
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
if "download" in request.form:
|
62 |
-
if download_type == "html":
|
63 |
-
return send_file(
|
64 |
-
html_path,
|
65 |
-
as_attachment=True,
|
66 |
-
download_name="output.html",
|
67 |
-
mimetype="text/html"
|
68 |
-
)
|
69 |
-
else: # PNG
|
70 |
-
# Convert HTML to PNG using imgkit
|
71 |
-
png_path = os.path.join(TEMP_DIR, "output.png")
|
72 |
-
imgkit.from_string(full_html, png_path, options={"quiet": ""})
|
73 |
-
return send_file(
|
74 |
-
png_path,
|
75 |
-
as_attachment=True,
|
76 |
-
download_name="output.png",
|
77 |
-
mimetype="image/png"
|
78 |
-
)
|
79 |
|
80 |
except Exception as e:
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
<h1>Markdown to PNG/HTML Converter</h1>
|
101 |
-
<form method="post">
|
102 |
-
<textarea name="markdown_text" placeholder="Paste your Markdown here...">{{ markdown_text }}</textarea><br>
|
103 |
-
<label for="download_type">Output format:</label>
|
104 |
-
<select name="download_type">
|
105 |
-
<option value="png" {% if download_type == 'png' %}selected{% endif %}>PNG</option>
|
106 |
-
<option value="html" {% if download_type == 'html' %}selected{% endif %}>HTML</option>
|
107 |
-
</select><br>
|
108 |
-
<button type="submit">Generate Preview</button>
|
109 |
-
{% if download_available %}
|
110 |
-
<button type="submit" name="download" value="true" class="download-btn">Download {{ download_type.upper() }}</button>
|
111 |
-
{% endif %}
|
112 |
-
</form>
|
113 |
-
{% if error_message %}
|
114 |
-
<p class="error">{{ error_message }}</p>
|
115 |
-
{% endif %}
|
116 |
-
{% if preview_html %}
|
117 |
-
<h2>Preview</h2>
|
118 |
-
<div class="preview">
|
119 |
-
{{ preview_html | safe }}
|
120 |
-
</div>
|
121 |
-
{% endif %}
|
122 |
-
</body>
|
123 |
-
</html>
|
124 |
-
""", preview_html=preview_html, download_available=download_available,
|
125 |
-
download_type=download_type, error_message=error_message, markdown_text=markdown_text)
|
126 |
-
|
127 |
-
if __name__ == "__main__":
|
128 |
-
app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import io
|
3 |
+
import re
|
4 |
+
import logging
|
5 |
+
import subprocess
|
6 |
+
from datetime import datetime
|
7 |
+
import urllib.parse
|
8 |
+
import tempfile
|
9 |
+
import json # For streaming JSON messages
|
10 |
+
import time # For gevent.sleep
|
11 |
|
12 |
+
from flask import Flask, request, render_template, Response, stream_with_context
|
13 |
+
from werkzeug.utils import secure_filename
|
14 |
+
|
15 |
+
# Ensure gevent is imported and monkey patched if needed for other libraries
|
16 |
+
# from gevent import monkey
|
17 |
+
# monkey.patch_all() # Apply this early if you suspect issues with other libs
|
18 |
+
|
19 |
+
import requests # For requests.exceptions.HTTPError
|
20 |
+
from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
|
21 |
+
|
22 |
+
import pdfplumber
|
23 |
+
import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
|
24 |
+
from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
|
25 |
+
# from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
|
26 |
+
|
27 |
+
import pytesseract
|
28 |
+
from PIL import Image
|
29 |
+
from huggingface_hub import HfApi, create_repo
|
30 |
+
|
31 |
+
# --- Flask App Initialization ---
app = Flask(__name__)
# Uploaded PDFs and intermediate page images are written under the system temp dir.
app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50 MB limit for uploads, adjust as needed

# --- Logging Configuration ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# --- Hugging Face Configuration ---
# HF_TOKEN is optional: without it, page images are still OCR'd but not uploaded.
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repo that receives extracted page images (created on demand).
HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted")
hf_api = HfApi()
|
44 |
+
|
45 |
+
# --- Helper to yield messages for streaming ---
def yield_message(msg_type, data):
    """Format one streaming message as a newline-terminated JSON string.

    `msg_type` (renamed from `type`, which shadowed the builtin) becomes the
    "type" field of the message; every key in `data` is merged in beside it.
    The trailing "\\n" makes the stream valid NDJSON for the client to split.
    All call sites pass both arguments positionally, so the rename is safe.
    """
    return json.dumps({"type": msg_type, **data}) + "\n"
|
49 |
+
|
50 |
+
# --- PDF Processing Helper Functions (Adapted for Streaming) ---
|
51 |
+
|
52 |
+
def check_poppler():
    """Probe for Poppler's `pdftoppm` binary; return True if it is runnable.

    Runs `pdftoppm -v` and logs whatever version banner it emits. Returns
    False (after logging) when the binary is missing or the probe fails.
    """
    try:
        proc = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
        # pdftoppm prints its version banner on stderr; fall back to stdout.
        banner = proc.stderr.strip() if proc.stderr else proc.stdout.strip()
        if banner:
            logger.info(f"Poppler version check: {banner.splitlines()[0] if banner else 'No version output'}")
        else:
            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False
|
67 |
+
|
68 |
+
def ensure_hf_dataset():
    """Ensure the HF dataset repo exists; return its repo_id or an error string.

    Returns either a usable repo id (possibly "namespace/name") or a string
    starting with "Error" — callers detect failure via `startswith("Error")`,
    so that prefix is part of this function's contract.
    """
    if not HF_TOKEN:
        msg = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
        logger.warning(msg)
        return "Error: " + msg
    try:
        # exist_ok=True makes this idempotent for repos we already own.
        repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
        return repo_id_obj.repo_id
    except RequestsHTTPError as e:
        # 409 Conflict: the repo already exists — resolve its fully-qualified id.
        if e.response is not None and e.response.status_code == 409:
            logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
            try:
                # whoami() gives the token owner's namespace for "namespace/name".
                user_info = hf_api.whoami(token=HF_TOKEN)
                namespace = user_info.get('name') if user_info else None
                if namespace:
                    return f"{namespace}/{HF_DATASET_REPO_NAME}"
                else:
                    logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
                    return HF_DATASET_REPO_NAME
            except Exception as whoami_e:
                # Best effort: fall back to the bare repo name if whoami fails.
                logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
                return HF_DATASET_REPO_NAME
        else:
            status_code = e.response.status_code if e.response is not None else "Unknown"
            logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
            return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"
    except Exception as e:
        logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
|
98 |
+
|
99 |
+
|
100 |
+
def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
    """Upload a PIL image to the HF dataset repo; return its URL or an error string.

    The image is written to a temporary PNG, pushed to
    `images/{filename_base}_{page_num_for_log}_{timestamp}.png`, and the temp
    file is always removed. On any failure a string starting with
    "Error uploading image" is returned (callers check `startswith("Error")`).
    """
    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
        return repo_id_or_error

    repo_id = repo_id_or_error
    temp_image_path = None
    try:
        # Microsecond timestamp keeps concurrent uploads from colliding in-repo.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"

        # Ensure UPLOAD_FOLDER exists before writing temp file.
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
            temp_image_path = tmp_file.name
            # Write through the open handle instead of re-opening the path:
            # re-opening a NamedTemporaryFile's path while it is still open
            # fails on Windows, and Pillow accepts file objects directly.
            image_pil.save(tmp_file, format="PNG")

        logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
        file_url = hf_api.upload_file(
            path_or_fileobj=temp_image_path, path_in_repo=repo_filename,
            repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
        )
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
        return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
    finally:
        # Always clean up the temp PNG, even when the upload failed.
        if temp_image_path and os.path.exists(temp_image_path):
            try:
                os.remove(temp_image_path)
            except OSError as ose:
                logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
|
131 |
+
|
132 |
+
|
133 |
+
def format_page_text_to_markdown_chunk(page_text_content):
    """Turn one page of raw extracted text into a Markdown chunk.

    Heuristics: an all-uppercase line of 6-99 characters becomes an "## "
    heading; numbered ("1.") or bulleted ("*", "+", "-") lines become "- "
    list items; everything else is emitted as a paragraph. Blank-line runs
    are collapsed and the chunk always ends with exactly one blank line.
    """
    normalized = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())
    pieces = []
    inside_list = False
    for raw_line in normalized.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: keep a separator and terminate any open list.
            pieces.append("\n")
            inside_list = False
            continue
        bullet = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', stripped)
        looks_like_heading = stripped.isupper() and 5 < len(stripped) < 100
        if looks_like_heading and bullet is None:
            pieces.append(f"## {stripped}\n\n")
            inside_list = False
        elif bullet is not None:
            # Normalize every list marker (numbered or bulleted) to "- ".
            pieces.append(f"- {bullet.group(1)}\n")
            inside_list = True
        else:
            if inside_list:
                # Close the list with a blank line before the paragraph.
                pieces.append("\n")
            pieces.append(f"{raw_line}\n\n")
            inside_list = False
    collapsed = re.sub(r'\n\s*\n+', '\n\n', "".join(pieces).strip())
    return collapsed + "\n\n"
|
158 |
+
|
159 |
+
|
160 |
+
# --- Main PDF Processing Logic (Generator Function for Streaming) ---
|
161 |
+
|
162 |
+
def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
    """Generator converting a PDF (local path or http(s) URL) into Markdown,
    yielding newline-delimited JSON progress/content messages (see yield_message).

    Stage 1 streams extracted text page-by-page via pdfplumber; Stage 2
    rasterizes pages with pdf2image/Poppler, OCRs each image with pytesseract
    and, when HF_TOKEN is set, uploads it to the HF dataset repo. Failures are
    reported as "error" messages — the generator itself never raises.
    """
    try:
        # Tell the client to reset its markdown pane before new content streams in.
        yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
        time.sleep(0.01)  # tiny pause so chunks flush as distinct stream messages

        yield yield_message("status", {"message": "Opening PDF for text extraction..."})
        time.sleep(0.01)

        source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
            pdf_input_source_path_or_url.startswith(('http://', 'https://'))

        pdf_handle_for_text = None   # path or BytesIO handed to pdfplumber
        pdf_bytes_for_images = None  # raw download reused later by pdf2image

        if source_is_url:
            try:
                response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
                response.raise_for_status()
                pdf_bytes_for_images = response.content
                pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
                time.sleep(0.01)
            except RequestsHTTPError as e:
                logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
                yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
                return
            except requests.RequestException as e:
                logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
                return
        else:
            pdf_handle_for_text = pdf_input_source_path_or_url

        # --- Stage 1: per-page text extraction ---
        total_text_pages = 0
        try:
            with pdfplumber.open(pdf_handle_for_text) as pdf:
                total_text_pages = len(pdf.pages)
                yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
                time.sleep(0.01)

                for i, page in enumerate(pdf.pages):
                    yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
                    time.sleep(0.01)

                    page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""

                    # Table extraction was removed in this revision; only raw page text is emitted.
                    formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)

                    yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
                    time.sleep(0.01)
        except Exception as e:
            # Text-stage failure is non-fatal: image extraction still runs below.
            logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})

        # --- Stage 2: image extraction (requires Poppler) ---
        if not check_poppler():
            yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
        else:
            yield yield_message("status", {"message": "Starting image extraction..."})
            yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
            if not HF_TOKEN:
                yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})

            time.sleep(0.01)
            extracted_pil_images_overall_count = 0  # running total across batches, used for numbering
            try:
                image_source_for_convert = None
                if source_is_url and pdf_bytes_for_images:
                    image_source_for_convert = pdf_bytes_for_images
                    logger.info("Using downloaded bytes for image conversion.")
                elif not source_is_url:
                    image_source_for_convert = pdf_input_source_path_or_url
                    logger.info("Using local file path for image conversion.")

                if image_source_for_convert:
                    try:
                        pdf_info = None
                        if isinstance(image_source_for_convert, bytes):
                            pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
                        else:
                            pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)

                        num_image_pages = pdf_info.get("Pages", 0)
                        yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})

                        # Convert one page per batch to bound peak memory use.
                        batch_size = 1
                        for page_idx_start in range(1, num_image_pages + 1, batch_size):
                            page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
                            yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
                            time.sleep(0.01)

                            page_images_pil = []
                            if isinstance(image_source_for_convert, bytes):
                                page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                            else:
                                page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)

                            for img_idx_in_batch, img_pil in enumerate(page_images_pil):
                                extracted_pil_images_overall_count += 1
                                current_pdf_page_num = page_idx_start + img_idx_in_batch  # actual PDF page number
                                page_num_for_log = f"pdfpage_{current_pdf_page_num}"

                                yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
                                time.sleep(0.01)

                                ocr_text = ""
                                try:
                                    ocr_text = pytesseract.image_to_string(img_pil).strip()
                                    if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
                                except Exception as ocr_e:
                                    # OCR failure becomes part of the output, not a crash.
                                    logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
                                    ocr_text = f"OCR failed: {str(ocr_e)}"

                                image_md_chunk = ""
                                if HF_TOKEN:
                                    image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
                                    if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                                        # NOTE(review): only a newline is appended on success — the image
                                        # markdown link appears to be missing here; confirm intended output.
                                        image_md_chunk += f"\n"
                                        yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
                                    else:
                                        image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
                                        yield yield_message("error", {"message": f"Failed to upload image {extracted_pil_images_overall_count}: {str(image_url_or_error)}"})
                                else:
                                    image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"

                                if ocr_text:
                                    image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"

                                yield yield_message("image_md", {"content": image_md_chunk})
                                time.sleep(0.01)
                    except Exception as e_img_info:
                        logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
                        yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
                        # Fallback to bulk conversion (all pages at once; less precise page numbering).
                        bulk_images_pil = []
                        if isinstance(image_source_for_convert, bytes):
                            bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150)
                        else:
                            bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150)

                        yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
                        for i, img_pil in enumerate(bulk_images_pil):
                            extracted_pil_images_overall_count +=1
                            page_num_for_log = f"bulk_image_{i+1}"  # less precise page info in fallback
                            yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
                            ocr_text = ""
                            try: ocr_text = pytesseract.image_to_string(img_pil).strip()
                            except Exception as e: ocr_text = f"OCR Error: {e}"

                            image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
                            if HF_TOKEN:
                                image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
                                if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                                    # NOTE(review): on success the chunk is replaced by a bare newline —
                                    # the image link appears lost here as well; confirm intended output.
                                    image_md_chunk = f"\n"
                                else:
                                    image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
                            if ocr_text: image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
                            else: image_md_chunk += "\n"
                            yield yield_message("image_md", {"content": image_md_chunk})
                            time.sleep(0.01)

                else:
                    yield yield_message("status", {"message": "No valid source for image extraction."})

            except Exception as e:
                logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})

        yield yield_message("final_status", {"message": "All processing stages complete."})

    except Exception as e:
        # Last-resort guard: surface any unexpected failure as a stream message.
        logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
        yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})
|
342 |
+
|
343 |
+
|
344 |
+
# --- Flask Routes ---
|
345 |
+
|
346 |
+
@app.route('/', methods=['GET'])
def index():
    """Serve the upload/URL form page (templates/index.html)."""
    return render_template('index.html')
|
349 |
+
|
350 |
+
@app.route('/process-stream', methods=['POST'])
def process_pdf_stream():
    """Accept a PDF upload or URL and stream conversion progress as NDJSON.

    Validates input (must be a .pdf upload or an http(s) URL), saves uploads
    to a temp file, delegates to generate_pdf_conversion_stream(), and always
    removes the temp file when the stream finishes. The request form fields
    are read before streaming starts because the request context may be gone
    once the generator is consumed.
    """
    pdf_file = request.files.get('pdf_file')
    pdf_url = request.form.get('pdf_url', '').strip()

    outer_temp_pdf_path = None

    def stream_processor():
        # Runs lazily as the response streams; cleanup happens in `finally`.
        nonlocal outer_temp_pdf_path
        pdf_input_source_for_generator = None

        try:
            if pdf_file and pdf_file.filename:
                if not pdf_file.filename.lower().endswith('.pdf'):
                    yield yield_message("error", {"message": "Uploaded file is not a PDF."})
                    return

                filename = secure_filename(pdf_file.filename)
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                # mkstemp gives a collision-free path; close the fd so save() can write.
                fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
                os.close(fd)
                pdf_file.save(temp_path)
                outer_temp_pdf_path = temp_path
                logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
                pdf_input_source_for_generator = outer_temp_pdf_path
                # Fix: the original f-string had no placeholder ("(unknown)") even
                # though the sanitized filename was computed — report it instead.
                yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
                time.sleep(0.01)

            elif pdf_url:
                unquoted_url = urllib.parse.unquote(pdf_url)
                if not (unquoted_url.startswith('http://') or unquoted_url.startswith('https://')):
                    yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
                    return

                pdf_input_source_for_generator = unquoted_url
                yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
                time.sleep(0.01)
            else:
                yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
                return

            # Relay every message from the conversion generator to the client.
            for message_part in generate_pdf_conversion_stream(pdf_input_source_for_generator):
                yield message_part

        except Exception as e:
            logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Setup error: {str(e)}"})
        finally:
            # Remove the uploaded temp PDF no matter how the stream ended.
            if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
                try:
                    os.remove(outer_temp_pdf_path)
                    logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
                except OSError as ose:
                    logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")

    return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')
|
406 |
+
|
407 |
+
|
408 |
+
# --- Main Execution ---
if __name__ == '__main__':
    # Warn at startup (rather than failing mid-request) if Poppler is missing.
    if not check_poppler():
        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    # NOTE(review): debug=True exposes the Werkzeug debugger — confirm this is
    # disabled for any production/public deployment.
    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|