Update app.py
Browse files
app.py
CHANGED
@@ -122,11 +122,43 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
def clear_files():
|
127 |
-
|
128 |
-
# Notice that we do NOT return anything for the API key or repo address.
|
129 |
-
return None, "", None
|
130 |
|
131 |
# Gradio interface setup
|
132 |
with gr.Blocks() as demo:
|
@@ -137,24 +169,20 @@ with gr.Blocks() as demo:
|
|
137 |
pdf_file_input = gr.File(label="Upload PDF")
|
138 |
convert_button = gr.Button("Convert to Parquet")
|
139 |
clear_button = gr.Button("Clear Files")
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
convert_button.click(
|
145 |
-
|
146 |
-
inputs=pdf_file_input,
|
147 |
-
outputs=
|
148 |
)
|
149 |
-
# The clear button now only clears file-related components;
|
150 |
-
# API key and Repo Address remain untouched.
|
151 |
clear_button.click(
|
152 |
clear_files,
|
153 |
inputs=None,
|
154 |
-
outputs=[pdf_file_input,
|
155 |
)
|
156 |
|
157 |
demo.launch()
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
+
def process_pdf_file(pdf_file, api_key, repo_address):
    """Convert an uploaded PDF to a Parquet file and optionally upload it.

    The PDF is passed to ``extract_full_paper_with_labels``; the returned
    dictionary becomes a one-row DataFrame that is written as
    ``<base>.parquet``, where ``<base>`` is ``result['filename']`` without
    its extension.

    Args:
        pdf_file: Value supplied by the upload component. Accepted forms are
            a filesystem path (``str`` or ``os.PathLike``), an object with a
            ``name`` attribute, or a mapping with a ``"name"`` key.
        api_key: Token forwarded to ``HfApi.upload_file``. A falsy value
            skips the upload.
        repo_address: Value forwarded as ``repo_id``. A falsy value skips
            the upload.

    Returns:
        A ``(parquet_path, status_message)`` tuple. ``parquet_path`` is
        ``None`` when no file was supplied.
    """
    # Treat every empty value (None, "", {}) as "nothing uploaded" rather
    # than failing later while trying to pull a path out of it.
    if not pdf_file:
        return None, "No PDF file uploaded."

    # Resolve the on-disk path of the upload. Plain paths are checked first:
    # a str has no ``name`` attribute (so the dict lookup would raise
    # TypeError), and ``pathlib.Path.name`` is only the final component.
    if isinstance(pdf_file, (str, os.PathLike)):
        file_path = os.fspath(pdf_file)
    elif hasattr(pdf_file, "name"):
        file_path = pdf_file.name
    else:
        file_path = pdf_file["name"]

    result = extract_full_paper_with_labels(file_path)

    # One row per paper: wrap the result dictionary in a list.
    df = pd.DataFrame([result])
    base = os.path.splitext(result["filename"])[0]
    parquet_filename = f"{base}.parquet"
    df.to_parquet(parquet_filename, index=False)

    if not (api_key and repo_address):
        return (
            parquet_filename,
            "API key or repo address not provided, skipping repo upload.",
        )

    api = HfApi()
    try:
        api.upload_file(
            path_or_fileobj=parquet_filename,
            # Only the file name goes into the repo, so a directory part of
            # the local output path cannot leak into the repo layout.
            path_in_repo=os.path.basename(parquet_filename),
            repo_id=repo_address,
            token=api_key,
        )
    except Exception as e:
        # Best effort: the local Parquet file is still returned for download
        # and the failure is reported through the status message.
        repo_status = f"Failed to upload to repo: {e}"
    else:
        repo_status = f"File uploaded to repo {repo_address} successfully."

    # Return the parquet file for local download and the status message.
    return parquet_filename, repo_status
|
158 |
+
|
159 |
+
# Function to clear only file-related inputs/outputs, preserving the API key and repo address.
|
160 |
def clear_files():
    """Return the reset values for the file-related components.

    Returns:
        The tuple ``(None, None, "")``, one value per output wired to the
        clear button: the PDF upload, the Parquet download and the status
        text box.
    """
    cleared_upload = None
    cleared_download = None
    cleared_status = ""
    return cleared_upload, cleared_download, cleared_status
|
|
|
|
|
162 |
|
163 |
# Gradio interface setup
|
164 |
with gr.Blocks() as demo:
|
|
|
169 |
pdf_file_input = gr.File(label="Upload PDF")
|
170 |
convert_button = gr.Button("Convert to Parquet")
|
171 |
clear_button = gr.Button("Clear Files")
|
172 |
+
with gr.Row():
|
173 |
+
download_file_output = gr.File(label="Download Parquet File")
|
174 |
+
repo_status_output = gr.Textbox(label="Repo Upload Status")
|
175 |
+
|
176 |
convert_button.click(
|
177 |
+
process_pdf_file,
|
178 |
+
inputs=[pdf_file_input, api_key_input, repo_address_input],
|
179 |
+
outputs=[download_file_output, repo_status_output]
|
180 |
)
|
181 |
+
# The clear button now only clears file-related components; API key and Repo Address remain unchanged.
|
|
|
182 |
clear_button.click(
|
183 |
clear_files,
|
184 |
inputs=None,
|
185 |
+
outputs=[pdf_file_input, download_file_output, repo_status_output]
|
186 |
)
|
187 |
|
188 |
demo.launch()
|
|
|
|
|
|