Update app.py
Added more robust label detection.
app.py
CHANGED
@@ -2,19 +2,35 @@ import gradio as gr
 import pandas as pd
 import fitz  # PyMuPDF
 import os
+import re
 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
 
-def extract_paragraphs_with_headers(pdf_path, progress=None):
+def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
-
-
+    content = ""
+
+    # Initialize metadata
+    title = ""
+    authors = ""
+    year = ""
+    doi = ""
+    abstract = ""
+    footnotes = ""
+    references = ""
+    sources = ""
     total_pages = len(doc)
     max_iterations = total_pages * 2  # To prevent infinite loops
     iteration_count = 0
 
+    # Regex patterns for detection
+    doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
+    year_pattern = r'\b(19|20)\d{2}\b'
+    reference_keywords = ['reference', 'bibliography', 'sources']
+    financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
+
     for page_num, page in enumerate(doc):
         iteration_count += 1
         if iteration_count > max_iterations:
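As a sanity check on the two patterns introduced above, they can be exercised on a throwaway string, independent of the rest of app.py; a minimal sketch (the sample text is made up):

import re

doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
year_pattern = r'\b(19|20)\d{2}\b'

sample = "Smith et al. (2021). Risk and Return. doi:10.1234/ABCD.5678"
print(re.search(doi_pattern, sample).group(0))   # 10.1234/ABCD.5678
print(re.search(year_pattern, sample).group(0))  # 2021

Note that the DOI character class is upper-case only, so lower-case DOI suffixes (common in practice) are only partially matched unless re.IGNORECASE is added.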
@@ -27,23 +43,85 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
         for block in blocks:
             if "lines" in block:
                 text = ""
+                max_font_size = 0
                 for line in block["lines"]:
                     for span in line["spans"]:
                         text += span["text"] + " "
+                        if span["size"] > max_font_size:
+                            max_font_size = span["size"]
 
                 text = text.strip()
 
-                #
-
-
-
-
-
-
-
+                # Title (First Page, Largest Font)
+                if page_num == 0 and max_font_size > 15 and not title:
+                    title = text
+                    content += f"<TITLE>{title}</TITLE>\n"
+
+                # Authors
+                elif re.search(r'author|by', text, re.IGNORECASE) and not authors:
+                    authors = text
+                    content += f"<AUTHORS>{authors}</AUTHORS>\n"
+
+                # Year
+                elif re.search(year_pattern, text) and not year:
+                    year = re.search(year_pattern, text).group(0)
+                    content += f"<YEAR>{year}</YEAR>\n"
+
+                # DOI
+                elif re.search(doi_pattern, text) and not doi:
+                    doi = re.search(doi_pattern, text).group(0)
+                    content += f"<DOI>{doi}</DOI>\n"
+
+                # Abstract
+                elif "abstract" in text.lower() and not abstract:
+                    abstract = text
+                    content += f"<ABSTRACT>{abstract}</ABSTRACT>\n"
+
+                # Footnotes (small fonts)
+                elif max_font_size < 10:
+                    footnotes += text + " "
+
+                # References
+                elif any(keyword in text.lower() for keyword in reference_keywords):
+                    references += text + " "
+
+                # Enhanced Table Detection
+                elif re.search(r"table\s*\d+", text, re.IGNORECASE):
+                    # Look for captions and adjacent text
+                    content += f"<TABLE>{text}</TABLE>\n"
+
+                # Enhanced Figure Detection
+                elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
+                    # Look for image bounding boxes
+                    content += f"<FIGURE>{text}</FIGURE>\n"
+
+                # Equations (look for math symbols)
+                elif re.search(r"=|∑|∫|±|×|π|μ|σ", text):
+                    content += f"<EQUATION>{text}</EQUATION>\n"
+
+                # Code Blocks (detect indentation or keywords)
+                elif re.search(r"def |class |import |for |while |if |else", text):
+                    content += f"<CODE>{text}</CODE>\n"
+
+                # Financial Metrics
+                elif any(fin_kw in text.lower() for fin_kw in financial_keywords):
+                    content += f"<FINANCIAL_METRIC>{text}</FINANCIAL_METRIC>\n"
+
+                # Regular Paragraph
+                else:
+                    content += f"<PARAGRAPH>{text}</PARAGRAPH>\n"
+
+    # Append Footnotes and References
+    if footnotes:
+        content += f"<FOOTNOTE>{footnotes.strip()}</FOOTNOTE>\n"
+    if references:
+        content += f"<REFERENCE>{references.strip()}</REFERENCE>\n"
 
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
-    return
+    return {
+        "filename": os.path.basename(pdf_path),
+        "content": content
+    }
 
 def upload_with_progress(file_path, repo_id, token, progress):
     """
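The title and footnote branches above key off the largest span size seen in a block. That span walk can be tried on its own with a throwaway in-memory PDF; a minimal sketch, separate from app.py (the strings and sizes are illustrative):

import fitz  # PyMuPDF

doc = fitz.open()  # new empty PDF, not a file on disk
page = doc.new_page()
page.insert_text((72, 100), "A Large Title", fontsize=20)
page.insert_text((72, 140), "Body text at a normal size.", fontsize=10)

for block in page.get_text("dict")["blocks"]:
    if "lines" in block:
        text = ""
        max_font_size = 0
        for line in block["lines"]:
            for span in line["spans"]:
                text += span["text"] + " "
                max_font_size = max(max_font_size, span["size"])
        print(round(max_font_size), "->", text.strip())

With the thresholds in this commit, the 20 pt block would be labeled a title on page one, while the 10 pt block falls through to the paragraph branch (footnotes require strictly less than 10 pt).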
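Since the function now returns one flat string of <TAG>…</TAG> segments per paper, downstream consumers have to split it back apart. A minimal sketch of that inverse step; parse_labeled_content is a hypothetical helper, not part of this commit:

import re

def parse_labeled_content(content):
    # Tags emitted above are single upper-case words such as TITLE or FINANCIAL_METRIC.
    return re.findall(r"<([A-Z_]+)>(.*?)</\1>", content, re.DOTALL)

sample = "<TITLE>A Study of X</TITLE>\n<PARAGRAPH>Some body text.</PARAGRAPH>\n"
for label, text in parse_labeled_content(sample):
    print(label, "->", text)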
@@ -87,20 +165,14 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
         if progress is not None:
             progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
 
-        # ✅ Step 1: Process PDF
-        extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
-        for item in extracted_data:
-            all_data.append({
-                'filename': os.path.basename(pdf_file.name),
-                'page_num': item['page_num'],
-                'text': item['text'],
-                'is_header': item['is_header']
-            })
+        # ✅ Step 1: Process PDF with Full Labels
+        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
+        all_data.append(extracted_data)
 
     print("📡 Converting Processed Data to Parquet")
     # ✅ Step 2: Convert to Parquet
     df = pd.DataFrame(all_data)
-    parquet_file = '
+    parquet_file = 'fully_labeled_papers.parquet'
 
     try:
         df.to_parquet(parquet_file, engine='pyarrow', index=False)
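Worth flagging for downstream users: the schema changes here from one row per paragraph (filename, page_num, text, is_header) to one row per PDF (filename, content). A quick read-back sketch against the file name set in this commit:

import pandas as pd

df = pd.read_parquet('fully_labeled_papers.parquet', engine='pyarrow')
print(df.columns.tolist())         # ['filename', 'content']
print(df.loc[0, 'content'][:200])  # first labeled segments of the first paper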
@@ -135,8 +207,9 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with
-    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset
+    title="PDF to Parquet Converter with Full Labeling",
+    description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
 )
 
 iface.launch()
+