Jobey1 committed · verified
Commit: b3bb65b · Parent(s): dfa54c4

Update app.py

Added more robust label detection.
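
The extractor now wraps each classified text block in an inline tag (<TITLE>, <AUTHORS>, <YEAR>, <DOI>, <ABSTRACT>, <TABLE>, <FIGURE>, <EQUATION>, <CODE>, <FINANCIAL_METRIC>, <PARAGRAPH>, plus trailing <FOOTNOTE> and <REFERENCE> blocks) and returns one tagged string per file. A minimal sketch of calling the new function (the path sample.pdf is illustrative, not part of this commit):

    # Hypothetical smoke test; any local PDF path works.
    result = extract_full_paper_with_labels("sample.pdf")
    print(result["filename"])       # "sample.pdf"
    print(result["content"][:300])  # e.g. "<TITLE>...</TITLE>\n<AUTHORS>...</AUTHORS>\n..."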

Files changed (1)
  1. app.py +97 -24
app.py CHANGED
@@ -2,19 +2,35 @@ import gradio as gr
 import pandas as pd
 import fitz  # PyMuPDF
 import os
+import re
 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
 
-def extract_paragraphs_with_headers(pdf_path, progress=None):
+def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
-    data = []
-
+    content = ""
+
+    # Initialize metadata
+    title = ""
+    authors = ""
+    year = ""
+    doi = ""
+    abstract = ""
+    footnotes = ""
+    references = ""
+    sources = ""
     total_pages = len(doc)
     max_iterations = total_pages * 2  # To prevent infinite loops
     iteration_count = 0
 
+    # Regex patterns for detection
+    doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
+    year_pattern = r'\b(19|20)\d{2}\b'
+    reference_keywords = ['reference', 'bibliography', 'sources']
+    financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
+
     for page_num, page in enumerate(doc):
         iteration_count += 1
         if iteration_count > max_iterations:
@@ -27,23 +43,85 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
         for block in blocks:
             if "lines" in block:
                 text = ""
+                max_font_size = 0
                 for line in block["lines"]:
                     for span in line["spans"]:
                         text += span["text"] + " "
+                        if span["size"] > max_font_size:
+                            max_font_size = span["size"]
 
                 text = text.strip()
 
-                # Detect headers based on font size
-                is_header = any(span["size"] > 15 for line in block["lines"] for span in line["spans"])
-
-                data.append({
-                    "page_num": page_num + 1,
-                    "text": text,
-                    "is_header": is_header
-                })
+                # Title (First Page, Largest Font)
+                if page_num == 0 and max_font_size > 15 and not title:
+                    title = text
+                    content += f"<TITLE>{title}</TITLE>\n"
+
+                # Authors
+                elif re.search(r'author|by', text, re.IGNORECASE) and not authors:
+                    authors = text
+                    content += f"<AUTHORS>{authors}</AUTHORS>\n"
+
+                # Year
+                elif re.search(year_pattern, text) and not year:
+                    year = re.search(year_pattern, text).group(0)
+                    content += f"<YEAR>{year}</YEAR>\n"
+
+                # DOI
+                elif re.search(doi_pattern, text) and not doi:
+                    doi = re.search(doi_pattern, text).group(0)
+                    content += f"<DOI>{doi}</DOI>\n"
+
+                # Abstract
+                elif "abstract" in text.lower() and not abstract:
+                    abstract = text
+                    content += f"<ABSTRACT>{abstract}</ABSTRACT>\n"
+
+                # Footnotes (small fonts)
+                elif max_font_size < 10:
+                    footnotes += text + " "
+
+                # References
+                elif any(keyword in text.lower() for keyword in reference_keywords):
+                    references += text + " "
+
+                # Enhanced Table Detection
+                elif re.search(r"table\s*\d+", text, re.IGNORECASE):
+                    # Look for captions and adjacent text
+                    content += f"<TABLE>{text}</TABLE>\n"
+
+                # Enhanced Figure Detection
+                elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
+                    # Look for image bounding boxes
+                    content += f"<FIGURE>{text}</FIGURE>\n"
+
+                # Equations (look for math symbols)
+                elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
+                    content += f"<EQUATION>{text}</EQUATION>\n"
+
+                # Code Blocks (detect indentation or keywords)
+                elif re.search(r"def |class |import |for |while |if |else", text):
+                    content += f"<CODE>{text}</CODE>\n"
+
+                # Financial Metrics
+                elif any(fin_kw in text.lower() for fin_kw in financial_keywords):
+                    content += f"<FINANCIAL_METRIC>{text}</FINANCIAL_METRIC>\n"
+
+                # Regular Paragraph
+                else:
+                    content += f"<PARAGRAPH>{text}</PARAGRAPH>\n"
+
+    # Append Footnotes and References
+    if footnotes:
+        content += f"<FOOTNOTE>{footnotes.strip()}</FOOTNOTE>\n"
+    if references:
+        content += f"<REFERENCE>{references.strip()}</REFERENCE>\n"
 
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
-    return data
+    return {
+        "filename": os.path.basename(pdf_path),
+        "content": content
+    }
 
 def upload_with_progress(file_path, repo_id, token, progress):
     """
@@ -87,20 +165,14 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
         if progress is not None:
             progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
 
-        # ✅ Step 1: Process PDF
-        extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
-        for item in extracted_data:
-            all_data.append({
-                'filename': os.path.basename(pdf_file.name),
-                'page_num': item['page_num'],
-                'text': item['text'],
-                'is_header': item['is_header']
-            })
+        # ✅ Step 1: Process PDF with Full Labels
+        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
+        all_data.append(extracted_data)
 
     print("🟡 Converting Processed Data to Parquet")
     # ✅ Step 2: Convert to Parquet
     df = pd.DataFrame(all_data)
-    parquet_file = 'papers_with_headers.parquet'
+    parquet_file = 'fully_labeled_papers.parquet'
 
     try:
         df.to_parquet(parquet_file, engine='pyarrow', index=False)
@@ -135,8 +207,9 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with Correct Upload API",
-    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset using the official API."
+    title="PDF to Parquet Converter with Full Labeling",
+    description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
 )
 
 iface.launch()
+
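
Note: the labeled sections can be recovered from the resulting Parquet with plain regex. A minimal read-back sketch (assumes the fully_labeled_papers.parquet written above; tag names match those emitted by extract_full_paper_with_labels):

    import re
    import pandas as pd

    df = pd.read_parquet("fully_labeled_papers.parquet")
    content = df.iloc[0]["content"]
    # Single-occurrence metadata tag.
    title = re.search(r"<TITLE>(.*?)</TITLE>", content, flags=re.DOTALL)
    # Repeating body tag.
    paragraphs = re.findall(r"<PARAGRAPH>(.*?)</PARAGRAPH>", content, flags=re.DOTALL)
    print(title.group(1) if title else "no title detected")
    print(f"{len(paragraphs)} paragraphs extracted")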