Jobey1 commited on
Commit
c7a5739
·
verified ·
1 Parent(s): b270230

Update app.py

Browse files

Users can drag and drop PDFs, or search for PDFs they want to convert to Parquet format. Users can download the converted file to their local machine, or can choose to upload it to a Hugging Face dataset using the API key for their repo. No API keys are collected or saved. The code checks that the repo exists before it transfers the files.

Files changed (1) hide show
  1. app.py +84 -26
app.py CHANGED
@@ -1,45 +1,103 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from pdfminer.high_level import extract_text
4
  import os
 
5
 
6
- def extract_paragraphs(text):
7
- # Split text into paragraphs based on double line breaks
8
- paragraphs = [para.strip() for para in text.split('\n\n') if para.strip()]
9
- return paragraphs
10
-
11
- def pdf_to_parquet(pdf_files):
12
  data = []
13
 
14
- for pdf_file in pdf_files:
15
- # Extract text from PDF
16
- text = extract_text(pdf_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Extract paragraphs
19
- paragraphs = extract_paragraphs(text)
20
 
21
- # Append to data list
22
- data.append({
23
- 'filename': os.path.basename(pdf_file.name),
24
- 'paragraphs': paragraphs
25
- })
 
 
26
 
27
  # Convert to DataFrame
28
- df = pd.DataFrame(data)
29
 
30
- # Save to Parquet
31
- parquet_file = 'converted_papers.parquet'
32
  df.to_parquet(parquet_file, engine='pyarrow', index=False)
33
 
34
- return parquet_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Gradio Interface
37
  iface = gr.Interface(
38
- fn=pdf_to_parquet,
39
- inputs=gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
40
- outputs=gr.File(label="Download Parquet File"),
41
- title="PDF to Parquet Converter with Paragraphs",
42
- description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training."
 
 
 
 
 
 
 
 
43
  )
44
 
45
  iface.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import fitz # PyMuPDF
4
  import os
5
+ from huggingface_hub import HfApi, HfHubHTTPError
6
 
7
def extract_paragraphs_with_headers(pdf_path, header_font_size=15):
    """Extract text blocks from a PDF, flagging probable section headers.

    Args:
        pdf_path: Path to a PDF file on disk.
        header_font_size: Font size (points) above which a block is
            treated as a header. Defaults to 15, the original threshold.

    Returns:
        A list of dicts with keys 'page_num' (1-based), 'text', and
        'is_header'.
    """
    data = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Image blocks have no "lines" key; skip them.
                if "lines" not in block:
                    continue
                spans = [span for line in block["lines"] for span in line["spans"]]
                text = " ".join(span["text"] for span in spans).strip()
                if not text:
                    # Skip whitespace-only blocks so the Parquet file
                    # contains no blank rows.
                    continue
                # Heuristic: any span larger than the threshold marks a header.
                is_header = any(span["size"] > header_font_size for span in spans)
                data.append({
                    "page_num": page_num,
                    "text": text,
                    "is_header": is_header,
                })
    finally:
        # fitz documents hold OS file handles; close explicitly
        # instead of leaking one per uploaded PDF.
        doc.close()
    return data
32
+
33
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Args:
        pdf_files: Gradio file objects; each exposes a ``.name`` disk path.
        hf_token: Hugging Face API token (used for this request only,
            never stored).
        dataset_repo_id: Target dataset repo, e.g. "username/research-dataset".
        action_choice: One of "Download Locally", "Upload to Hugging Face",
            or "Both".

    Returns:
        Tuple of (path to the Parquet file, upload status message).
    """
    all_data = []
    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file.name)
        for item in extract_paragraphs_with_headers(pdf_file.name):
            all_data.append({
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header']
            })

    # Convert to DataFrame and save as Parquet
    df = pd.DataFrame(all_data)
    parquet_file = 'papers_with_headers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    upload_message = ""

    # Only upload if the user selects it
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            # Pass the token to the client directly: HfApi.set_access_token()
            # was removed from huggingface_hub and raises AttributeError on
            # current versions, which made every upload fail.
            api = HfApi(token=hf_token)

            # Validate the user's repo before transferring anything
            try:
                api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
                repo_exists = True
            except HfHubHTTPError:
                repo_exists = False

            if repo_exists:
                api.upload_file(
                    path_or_fileobj=parquet_file,
                    path_in_repo='papers_with_headers.parquet',
                    repo_id=dataset_repo_id,
                    repo_type='dataset'
                )
                upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"
            else:
                upload_message = "❌ Dataset repo not found. Please check the repo ID."

        except Exception as e:
            upload_message = f"❌ Upload failed: {str(e)}"

    # Return the file for local download + upload status
    return parquet_file, upload_message
85
 
86
# Gradio Interface: PDFs in, a Parquet file plus a status message out.
# The token field is password-masked and only forwarded per request.
pdf_input = gr.File(
    file_types=[".pdf"],
    file_count="multiple",
    label="Upload PDFs (Drag & Drop or Search)",
)
token_input = gr.Textbox(
    label="Hugging Face API Token",
    type="password",
    placeholder="Enter your Hugging Face API token",
)
repo_input = gr.Textbox(
    label="Your Dataset Repo ID (e.g., username/research-dataset)",
    placeholder="username/research-dataset",
)
action_input = gr.Radio(
    ["Download Locally", "Upload to Hugging Face", "Both"],
    label="Action",
    value="Download Locally",
)

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[pdf_input, token_input, repo_input, action_input],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status"),
    ],
    title="PDF to Parquet Converter with User-Controlled Upload",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
)

iface.launch()