Jobey1 committed on
Commit
69c287e
·
verified ·
1 Parent(s): 09053ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -95
app.py CHANGED
@@ -8,7 +8,7 @@ from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
- print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
13
  content = ""
14
 
@@ -28,7 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
28
  # Regex patterns for detection
29
  doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
30
  year_pattern = r'\b(19|20)\d{2}\b'
31
- code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
32
  reference_keywords = ['reference', 'bibliography', 'sources']
33
  financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
34
 
@@ -122,99 +122,39 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
122
  "content": content
123
  }
124
 
125
- def upload_with_progress(file_path, repo_id, token, progress):
126
- """
127
- Upload file to Hugging Face Dataset using upload_file() API method.
128
- """
129
- print(f"πŸ“€ Starting upload of Parquet: {file_path}")
130
- file_size = os.path.getsize(file_path)
131
-
132
- api = HfApi()
133
-
134
- try:
135
- # Use upload_file() method from huggingface_hub
136
- api.upload_file(
137
- path_or_fileobj=file_path,
138
- path_in_repo=os.path.basename(file_path),
139
- repo_id=repo_id,
140
- repo_type="dataset",
141
- token=token
142
- )
143
-
144
- if progress is not None:
145
- progress(1, desc="✅ Upload Complete")
146
-
147
- print(f"✅ Successfully uploaded to {repo_id}")
148
- return f"✅ Successfully uploaded to {repo_id}"
149
-
150
- except HfHubHTTPError as e:
151
- print(f"❌ Upload failed: {e}")
152
- return f"❌ Upload failed: {str(e)}"
153
- except Exception as e:
154
- print(f"❌ Unexpected error: {e}")
155
- return f"❌ Unexpected error: {str(e)}"
156
-
157
- def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, state, progress=gr.Progress()):
158
- all_data = []
159
-
160
- total_files = len(pdf_files)
161
- print("🚀 Starting PDF to Parquet Conversion Process")
162
-
163
- for idx, pdf_file in enumerate(pdf_files):
164
- if progress:
165
- progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
166
-
167
- extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
168
- all_data.append(extracted_data)
169
-
170
- print("🟑 Converting Processed Data to Parquet")
171
- df = pd.DataFrame(all_data)
172
- parquet_file = 'fully_labeled_papers.parquet'
173
-
174
- try:
175
- df.to_parquet(parquet_file, engine='pyarrow', index=False)
176
- print("✅ Parquet Conversion Completed")
177
- except Exception as e:
178
- print(f"❌ Parquet Conversion Failed: {str(e)}")
179
- return None, f"❌ Parquet Conversion Failed: {str(e)}", state
180
-
181
- upload_message = "Skipped Upload"
182
-
183
- # βœ… Upload Parquet if selected
184
- if action_choice in ["Upload to Hugging Face", "Both"]:
185
- try:
186
- upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
187
- except Exception as e:
188
- print(f"❌ Upload Failed: {str(e)}")
189
- upload_message = f"❌ Upload failed: {str(e)}"
190
-
191
- print("🏁 Process Completed")
192
-
193
- # βœ… Clear Uploaded PDFs and Parquet File
194
- if os.path.exists(parquet_file):
195
- os.remove(parquet_file)
196
- print("🗑️ Parquet file cleared after processing.")
197
-
198
- return None, upload_message, state
199
-
200
- # βœ… Gradio Interface
201
- iface = gr.Interface(
202
- fn=pdf_to_parquet_and_upload,
203
- inputs=[
204
- gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
205
- gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),
206
- gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
207
- gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")
208
- ],
209
- outputs=[
210
- gr.File(label="Download Parquet File"),
211
- gr.Textbox(label="Status")
212
- ],
213
- title="PDF to Parquet Converter with Full Labeling",
214
- description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
215
- )
216
-
217
- iface.launch()
218
 
219
 
220
 
 
8
  import time
9
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
+ print(f"πŸ“ Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
13
  content = ""
14
 
 
28
  # Regex patterns for detection
29
  doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
30
  year_pattern = r'\b(19|20)\d{2}\b'
31
+ code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|}|;)"
32
  reference_keywords = ['reference', 'bibliography', 'sources']
33
  financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
34
 
 
122
  "content": content
123
  }
124
 
125
+ # NEW: Function to clear file-related inputs/outputs only.
126
+ def clear_files():
127
+ # Return empty values for file input and output display.
128
+ # Notice that we do NOT return anything for the API key or repo address.
129
+ return None, "", None
130
+
131
+ # Gradio interface setup
132
+ with gr.Blocks() as demo:
133
+ with gr.Row():
134
+ api_key_input = gr.Textbox(label="API Key", placeholder="Enter API Key")
135
+ repo_address_input = gr.Textbox(label="Repo Address", placeholder="Enter Repo Address")
136
+ with gr.Row():
137
+ pdf_file_input = gr.File(label="Upload PDF")
138
+ convert_button = gr.Button("Convert to Parquet")
139
+ clear_button = gr.Button("Clear Files")
140
+ output_display = gr.Textbox(label="Output")
141
+ # (Optional) A hidden textbox for parquet data, if used later.
142
+ parquet_output = gr.Textbox(label="Parquet Data", visible=False)
143
+
144
+ convert_button.click(
145
+ extract_full_paper_with_labels,
146
+ inputs=pdf_file_input,
147
+ outputs=output_display
148
+ )
149
+ # The clear button now only clears file-related components;
150
+ # API key and Repo Address remain untouched.
151
+ clear_button.click(
152
+ clear_files,
153
+ inputs=None,
154
+ outputs=[pdf_file_input, output_display, parquet_output]
155
+ )
156
+
157
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
 
160