Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ from huggingface_hub.utils import HfHubHTTPError
|
|
8 |
import time
|
9 |
|
10 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
11 |
-
print(f"
|
12 |
doc = fitz.open(pdf_path)
|
13 |
content = ""
|
14 |
|
@@ -28,7 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
28 |
# Regex patterns for detection
|
29 |
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
30 |
year_pattern = r'\b(19|20)\d{2}\b'
|
31 |
-
code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{
|
32 |
reference_keywords = ['reference', 'bibliography', 'sources']
|
33 |
financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
|
34 |
|
@@ -122,99 +122,39 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
all_data = []
|
159 |
-
|
160 |
-
total_files = len(pdf_files)
|
161 |
-
print("π Starting PDF to Parquet Conversion Process")
|
162 |
-
|
163 |
-
for idx, pdf_file in enumerate(pdf_files):
|
164 |
-
if progress:
|
165 |
-
progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
|
166 |
-
|
167 |
-
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
168 |
-
all_data.append(extracted_data)
|
169 |
-
|
170 |
-
print("π‘ Converting Processed Data to Parquet")
|
171 |
-
df = pd.DataFrame(all_data)
|
172 |
-
parquet_file = 'fully_labeled_papers.parquet'
|
173 |
-
|
174 |
-
try:
|
175 |
-
df.to_parquet(parquet_file, engine='pyarrow', index=False)
|
176 |
-
print("✅ Parquet Conversion Completed")
|
177 |
-
except Exception as e:
|
178 |
-
print(f"β Parquet Conversion Failed: {str(e)}")
|
179 |
-
return None, f"β Parquet Conversion Failed: {str(e)}", state
|
180 |
-
|
181 |
-
upload_message = "Skipped Upload"
|
182 |
-
|
183 |
-
# ✅ Upload Parquet if selected
|
184 |
-
if action_choice in ["Upload to Hugging Face", "Both"]:
|
185 |
-
try:
|
186 |
-
upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
|
187 |
-
except Exception as e:
|
188 |
-
print(f"β Upload Failed: {str(e)}")
|
189 |
-
upload_message = f"β Upload failed: {str(e)}"
|
190 |
-
|
191 |
-
print("π Process Completed")
|
192 |
-
|
193 |
-
# ✅ Clear Uploaded PDFs and Parquet File
|
194 |
-
if os.path.exists(parquet_file):
|
195 |
-
os.remove(parquet_file)
|
196 |
-
print("ποΈ Parquet file cleared after processing.")
|
197 |
-
|
198 |
-
return None, upload_message, state
|
199 |
-
|
200 |
-
# ✅ Gradio Interface
|
201 |
-
iface = gr.Interface(
|
202 |
-
fn=pdf_to_parquet_and_upload,
|
203 |
-
inputs=[
|
204 |
-
gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
|
205 |
-
gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),
|
206 |
-
gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
|
207 |
-
gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")
|
208 |
-
],
|
209 |
-
outputs=[
|
210 |
-
gr.File(label="Download Parquet File"),
|
211 |
-
gr.Textbox(label="Status")
|
212 |
-
],
|
213 |
-
title="PDF to Parquet Converter with Full Labeling",
|
214 |
-
description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
|
215 |
-
)
|
216 |
-
|
217 |
-
iface.launch()
|
218 |
|
219 |
|
220 |
|
|
|
8 |
import time
|
9 |
|
10 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
11 |
+
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
12 |
doc = fitz.open(pdf_path)
|
13 |
content = ""
|
14 |
|
|
|
28 |
# Regex patterns for detection
|
29 |
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
30 |
year_pattern = r'\b(19|20)\d{2}\b'
|
31 |
+
code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|}|;)"
|
32 |
reference_keywords = ['reference', 'bibliography', 'sources']
|
33 |
financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
|
34 |
|
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
+
# NEW: Function to clear file-related inputs/outputs only.
|
126 |
+
def clear_files():
|
127 |
+
# Return empty values for file input and output display.
|
128 |
+
# Notice that we do NOT return anything for the API key or repo address.
|
129 |
+
return None, "", None
|
130 |
+
|
131 |
+
# Gradio interface setup
|
132 |
+
with gr.Blocks() as demo:
|
133 |
+
with gr.Row():
|
134 |
+
api_key_input = gr.Textbox(label="API Key", placeholder="Enter API Key")
|
135 |
+
repo_address_input = gr.Textbox(label="Repo Address", placeholder="Enter Repo Address")
|
136 |
+
with gr.Row():
|
137 |
+
pdf_file_input = gr.File(label="Upload PDF")
|
138 |
+
convert_button = gr.Button("Convert to Parquet")
|
139 |
+
clear_button = gr.Button("Clear Files")
|
140 |
+
output_display = gr.Textbox(label="Output")
|
141 |
+
# (Optional) A hidden textbox for parquet data, if used later.
|
142 |
+
parquet_output = gr.Textbox(label="Parquet Data", visible=False)
|
143 |
+
|
144 |
+
convert_button.click(
|
145 |
+
extract_full_paper_with_labels,
|
146 |
+
inputs=pdf_file_input,
|
147 |
+
outputs=output_display
|
148 |
+
)
|
149 |
+
# The clear button now only clears file-related components;
|
150 |
+
# API key and Repo Address remain untouched.
|
151 |
+
clear_button.click(
|
152 |
+
clear_files,
|
153 |
+
inputs=None,
|
154 |
+
outputs=[pdf_file_input, output_display, parquet_output]
|
155 |
+
)
|
156 |
+
|
157 |
+
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
|
160 |
|