Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ from huggingface_hub.utils import HfHubHTTPError
|
|
8 |
import time
|
9 |
|
10 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
11 |
-
print(f"
|
12 |
doc = fitz.open(pdf_path)
|
13 |
content = ""
|
14 |
|
@@ -28,7 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
28 |
# Regex patterns for detection
|
29 |
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
30 |
year_pattern = r'\b(19|20)\d{2}\b'
|
31 |
-
code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{
|
32 |
reference_keywords = ['reference', 'bibliography', 'sources']
|
33 |
financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
|
34 |
|
@@ -122,65 +122,131 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
-
def
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
try:
|
143 |
-
|
144 |
-
path_or_fileobj=parquet_filename,
|
145 |
-
path_in_repo=parquet_filename,
|
146 |
-
repo_id=repo_address,
|
147 |
-
token=api_key
|
148 |
-
)
|
149 |
-
repo_status = f"File uploaded to repo {repo_address} successfully."
|
150 |
except Exception as e:
|
151 |
-
|
152 |
-
|
153 |
-
repo_status = "API key or repo address not provided, skipping repo upload."
|
154 |
-
|
155 |
-
return parquet_filename, repo_status
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
|
|
160 |
|
161 |
-
# Gradio interface setup
|
162 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
with gr.Row():
|
164 |
-
|
165 |
-
|
166 |
with gr.Row():
|
167 |
-
|
168 |
-
|
169 |
-
clear_button = gr.Button("Clear Files")
|
170 |
with gr.Row():
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
convert_button.click(
|
175 |
-
|
176 |
-
inputs=[
|
177 |
-
outputs=[
|
178 |
)
|
179 |
-
|
180 |
-
|
|
|
181 |
inputs=None,
|
182 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
)
|
184 |
|
185 |
demo.launch()
|
186 |
|
|
|
|
8 |
import time
|
9 |
|
10 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
11 |
+
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
12 |
doc = fitz.open(pdf_path)
|
13 |
content = ""
|
14 |
|
|
|
28 |
# Regex patterns for detection
|
29 |
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
30 |
year_pattern = r'\b(19|20)\d{2}\b'
|
31 |
+
code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
|
32 |
reference_keywords = ['reference', 'bibliography', 'sources']
|
33 |
financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
|
34 |
|
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
+
def upload_with_progress(file_path, repo_id, token, progress):
    """Upload a local file to a Hugging Face dataset repository.

    The file is sent with ``HfApi.upload_file`` and stored at the root of the
    repository under its base name. Failures are reported through the
    returned status string instead of being raised.

    Args:
        file_path: Path of the local file to upload.
        repo_id: Target dataset repo ID, e.g. ``username/research-dataset``.
        token: Hugging Face API token forwarded to ``upload_file``.
        progress: Optional progress callback. When it is not ``None`` it is
            called once with ``1`` after the upload has finished.

    Returns:
        A human-readable status message describing success or failure.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")

    # Report a missing file as a status message rather than letting an
    # OSError escape before the error handling below is reached.
    if not os.path.isfile(file_path):
        message = f"❌ Upload failed: file not found: {file_path}"
        print(message)
        return message

    try:
        # Client construction is inside the try so that any failure here is
        # also turned into a status message.
        api = HfApi()
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )

        if progress is not None:
            progress(1, desc="✅ Upload Complete")

        print(f"✅ Successfully uploaded to {repo_id}")
        return f"✅ Successfully uploaded to {repo_id}"

    except HfHubHTTPError as e:
        # HTTP-level failure reported by the Hub (auth, missing repo, ...).
        print(f"❌ Upload failed: {e}")
        return f"❌ Upload failed: {str(e)}"
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        return f"❌ Unexpected error: {str(e)}"
|
156 |
+
|
157 |
+
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Each PDF is passed to ``extract_full_paper_with_labels``; the returned
    records are collected into a DataFrame and written to
    ``fully_labeled_papers.parquet`` in the working directory. When
    ``action_choice`` is ``"Upload to Hugging Face"`` or ``"Both"`` the file
    is then uploaded with ``upload_with_progress``.

    Args:
        pdf_files: Uploaded files from the Gradio file input. Items may be
            objects exposing a ``.name`` path or plain path strings.
        hf_token: Hugging Face API token used for the upload.
        dataset_repo_id: Target dataset repo ID for the upload.
        action_choice: Selected action from the radio control.
        progress: Gradio progress tracker; may be ``None``.

    Returns:
        A ``(parquet_path, status_message)`` tuple. ``parquet_path`` is
        ``None`` when no files were supplied or the Parquet write failed.
    """
    # Clicking "Convert" with nothing uploaded yields None or an empty list;
    # bail out instead of crashing on len() or writing an empty file.
    if not pdf_files:
        print("❌ No PDF files provided")
        return None, "❌ No PDF files provided"

    all_data = []
    total_files = len(pdf_files)
    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")

        # Step 1: process the PDF with full labels. Accept both file wrappers
        # (path in ``.name``) and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        extracted_data = extract_full_paper_with_labels(pdf_path, progress=progress)
        all_data.append(extracted_data)

    print("🟡 Converting Processed Data to Parquet")
    # Step 2: convert the collected records to Parquet.
    df = pd.DataFrame(all_data)
    parquet_file = 'fully_labeled_papers.parquet'

    try:
        df.to_parquet(parquet_file, engine='pyarrow', index=False)
        print("✅ Parquet Conversion Completed")
    except Exception as e:
        print(f"❌ Parquet Conversion Failed: {str(e)}")
        return None, f"❌ Parquet Conversion Failed: {str(e)}"

    upload_message = "Skipped Upload"

    # Step 3: upload the Parquet file (if selected).
    if action_choice in ["Upload to Hugging Face", "Both"]:
        if not dataset_repo_id or not dataset_repo_id.strip():
            # Without a repo ID the upload cannot succeed; say so directly.
            upload_message = "❌ Upload failed: dataset repo ID not provided"
            print(upload_message)
        else:
            try:
                upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
            except Exception as e:
                print(f"❌ Upload Failed: {str(e)}")
                upload_message = f"❌ Upload failed: {str(e)}"

    print("🏁 Process Completed")
    return parquet_file, upload_message
|
195 |
+
|
196 |
+
# Callback for the custom "Reset Files Only" button.
def reset_files_fn():
    """Return the values that clear the file input and the output file.

    Returns:
        A ``(None, None)`` tuple: one ``None`` for the PDF file input and one
        for the generated Parquet file output, clearing both.
    """
    return None, None
|
200 |
|
|
|
201 |
# Gradio UI: file upload, credentials, action selection, and the three
# buttons (convert, reset files, clear all) wired to their callbacks.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PDF to Parquet Converter with Full Labeling

        **Clear All Inputs:** The button below (labeled "Clear All Inputs") will reset every field, including your API key and dataset repo ID.
        **Reset Files Only:** Use this button if you want to clear the PDF file uploads and the generated Parquet file, while keeping your credentials intact.
        """
    )

    with gr.Row():
        pdf_input = gr.File(
            file_types=[".pdf"],
            file_count="multiple",
            label="Upload PDFs (Drag & Drop or Search)",
        )
    with gr.Row():
        hf_token = gr.Textbox(
            label="Hugging Face API Token",
            type="password",
            placeholder="Enter your Hugging Face API token",
        )
        dataset_repo = gr.Textbox(
            label="Your Dataset Repo ID (e.g., username/research-dataset)",
            placeholder="username/research-dataset",
        )
    with gr.Row():
        action_radio = gr.Radio(
            ["Download Locally", "Upload to Hugging Face", "Both"],
            label="Action",
            value="Download Locally",
        )

    with gr.Row():
        convert_button = gr.Button("Convert PDF to Parquet")
        reset_files_button = gr.Button("Reset Files Only")
        clear_all_button = gr.Button("Clear All Inputs")

    with gr.Row():
        output_file = gr.File(label="Download Parquet File")
        status_text = gr.Textbox(label="Status")

    convert_button.click(
        fn=pdf_to_parquet_and_upload,
        inputs=[pdf_input, hf_token, dataset_repo, action_radio],
        outputs=[output_file, status_text],
    )

    reset_files_button.click(
        fn=reset_files_fn,
        inputs=None,
        outputs=[pdf_input, output_file],
    )

    # The Clear All button resets every input field. It also clears the
    # generated file and the status text so no stale result from a previous
    # run stays on screen after everything else has been wiped.
    def clear_all_fn():
        return None, None, None, "Download Locally", None, None

    clear_all_button.click(
        fn=clear_all_fn,
        inputs=None,
        outputs=[pdf_input, hf_token, dataset_repo, action_radio, output_file, status_text],
    )

demo.launch()
|
251 |
|
252 |
+
|