jayyai committed on
Commit
b71fd6c
·
verified ·
1 Parent(s): dfe93bc
Files changed (3) hide show
  1. app.py +1 -1
  2. pdf_route.py +64 -122
  3. requirements.txt +2 -1
app.py CHANGED
@@ -14,7 +14,7 @@ app = FastAPI(
14
  # Add CORS middleware
15
  app.add_middleware(
16
  CORSMiddleware,
17
- allow_origins=["https://readmytable.vercel.app"], # Allows all origins
18
  allow_credentials=True,
19
  allow_methods=["*"], # Allows all methods
20
  allow_headers=["*"], # Allows all headers
 
14
  # Add CORS middleware
15
  app.add_middleware(
16
  CORSMiddleware,
17
+ allow_origins=["https://readmytable.vercel.app", "http://localhost:3000"], # Allow only these listed origins
18
  allow_credentials=True,
19
  allow_methods=["*"], # Allows all methods
20
  allow_headers=["*"], # Allows all headers
pdf_route.py CHANGED
@@ -34,7 +34,7 @@ async def convert_to_markdown(file: UploadFile = File(...)):
34
 
35
  # Analyze the document
36
  result = analyze_document(temp_pdf_path)
37
-
38
  # Create markdown file
39
  temp_md_path = "temp.md"
40
  create_markdown_file(result, temp_md_path)
@@ -71,13 +71,25 @@ async def convert_to_excel(file: UploadFile = File(...)):
71
  try:
72
  # Read the markdown content
73
  content = await file.read()
74
- markdown_text = content.decode('utf-8')
75
 
76
- # Extract tables from markdown
77
- tables = extract_tables_from_markdown(markdown_text)
 
 
78
 
79
- if not tables:
80
- raise HTTPException(status_code=400, detail="No tables found in the markdown content")
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  # Create Excel file
83
  excel_buffer = create_excel_from_markdown_tables(tables)
@@ -104,20 +116,27 @@ async def convert_to_word(file: UploadFile = File(...)):
104
  StreamingResponse: Word document file
105
  """
106
  try:
107
- # Read the markdown content
108
  content = await file.read()
109
- markdown_text = content.decode('utf-8')
110
 
111
- # Create Word file
112
- temp_docx_path = "temp.docx"
113
- create_word_from_markdown(markdown_text, temp_docx_path)
 
 
 
 
 
 
 
114
 
115
- # Read the Word file
116
- with open(temp_docx_path, "rb") as f:
117
  word_content = f.read()
118
 
119
- # Clean up temporary file
120
- os.remove(temp_docx_path)
 
121
 
122
  # Return the Word file as a download
123
  return StreamingResponse(
@@ -134,6 +153,7 @@ async def convert_to_word(file: UploadFile = File(...)):
134
  def analyze_document(file_path):
135
  """Analyze document using Azure Form Recognizer"""
136
  endpoint = "https://aal-ocr-ai-azureapi.cognitiveservices.azure.com/"
 
137
  key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
138
 
139
  document_analysis_client = DocumentAnalysisClient(
@@ -144,7 +164,7 @@ def analyze_document(file_path):
144
  poller = document_analysis_client.begin_analyze_document(
145
  "prebuilt-layout", document=f
146
  )
147
-
148
  result = poller.result()
149
  return result
150
 
@@ -185,86 +205,16 @@ def create_excel_from_markdown_tables(tables):
185
  """Create Excel file from markdown tables"""
186
  excel_buffer = BytesIO()
187
 
188
- with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
189
  for i, table in enumerate(tables):
190
- if table:
191
- # Convert table to DataFrame
192
- df = pd.DataFrame(table[1:], columns=table[0])
193
-
194
- # Save to Excel sheet
195
- sheet_name = f"Table_{i+1}"
196
- df.to_excel(writer, sheet_name=sheet_name, index=False)
197
 
198
  excel_buffer.seek(0)
199
  return excel_buffer
200
 
201
- def create_word_from_markdown(markdown_text, output_file):
202
- """Create Word document from markdown text"""
203
- doc = Document()
204
-
205
- lines = markdown_text.split('\n')
206
- current_table = []
207
- in_table = False
208
-
209
- for line in lines:
210
- # Handle headers
211
- if line.startswith('#'):
212
- level = len(line.split()[0]) # Count the number of '#'
213
- text = line.lstrip('#').strip()
214
- doc.add_heading(text, level=min(level, 9))
215
-
216
- # Handle tables
217
- elif '|' in line:
218
- # Skip separator lines
219
- if re.match(r'^[\s|:-]+$', line):
220
- continue
221
-
222
- # Process table row
223
- cells = [cell.strip() for cell in line.split('|')[1:-1]]
224
- if cells:
225
- if not in_table:
226
- in_table = True
227
- current_table = []
228
- current_table.append(cells)
229
-
230
- # Handle end of table
231
- elif in_table:
232
- if current_table:
233
- table = doc.add_table(rows=len(current_table), cols=len(current_table[0]))
234
- table.style = 'Table Grid'
235
-
236
- for i, row in enumerate(current_table):
237
- for j, cell in enumerate(row):
238
- table.cell(i, j).text = cell
239
-
240
- doc.add_paragraph() # Add space after table
241
- current_table = []
242
- in_table = False
243
-
244
- # Handle checkbox lists
245
- elif line.strip().startswith('- ['):
246
- p = doc.add_paragraph()
247
- run = p.add_run()
248
- if 'x' in line or 'X' in line:
249
- run.add_text("☑ " + line[5:].strip())
250
- else:
251
- run.add_text("☐ " + line[5:].strip())
252
-
253
- # Handle regular paragraphs
254
- elif line.strip():
255
- doc.add_paragraph(line.strip())
256
-
257
- # Handle the last table if exists
258
- if in_table and current_table:
259
- table = doc.add_table(rows=len(current_table), cols=len(current_table[0]))
260
- table.style = 'Table Grid'
261
-
262
- for i, row in enumerate(current_table):
263
- for j, cell in enumerate(row):
264
- table.cell(i, j).text = cell
265
-
266
- doc.save(output_file)
267
-
268
  def create_markdown_file(result, output_file):
269
  """Create markdown file from analysis result"""
270
  with open(output_file, 'w', encoding='utf-8') as md_file:
@@ -272,7 +222,7 @@ def create_markdown_file(result, output_file):
272
  # md_file.write(f"### Page {page.page_number}\n\n")
273
 
274
  elements = []
275
- elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.05, 'paragraph', paragraph)
276
  for paragraph in result.paragraphs if paragraph.bounding_regions[0].page_number == page.page_number])
277
  elements.sort(key=lambda x: x[0])
278
 
@@ -294,9 +244,8 @@ def create_markdown_file(result, output_file):
294
  elements = [element for element in elements if element[2] != title_paragraph]
295
  md_file.write(f"# {title_paragraph.content}\n\n")
296
 
297
- elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.05, 'table', table)
298
  for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
299
- elements.extend([(mark.polygon[0].y + mark.polygon[0].x*0.05, 'selection_mark', mark) for mark in page.selection_marks])
300
 
301
  elements.sort(key=lambda x: x[0])
302
 
@@ -305,7 +254,8 @@ def create_markdown_file(result, output_file):
305
  if element_type == 'paragraph':
306
  if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
307
  continue
308
- md_file.write(f"{element.content}\n\n")
 
309
 
310
  elif element_type == 'table':
311
  for row_idx in range(element.row_count):
@@ -314,18 +264,12 @@ def create_markdown_file(result, output_file):
314
  cell_content = ""
315
  for cell in element.cells:
316
  if cell.row_index == row_idx and cell.column_index == col_idx:
317
- cell_content = cell.content
318
  table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
319
  break
320
  row_content += f"{cell_content} | "
321
  md_file.write(row_content + "\n")
322
  md_file.write("\n")
323
-
324
- elif element_type == 'selection_mark':
325
- if element.state == "selected":
326
- md_file.write("- [x] \n\n")
327
- else:
328
- md_file.write("- [ ] \n\n")
329
 
330
  def create_word_file(result, output_file):
331
  """Create Word document from analysis result"""
@@ -334,6 +278,7 @@ def create_word_file(result, output_file):
334
 
335
  # Analyze pages
336
  for page in result.pages:
 
337
  # Combine paragraphs, tables, and selection marks in the order they appear on the page
338
  elements = []
339
  elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
@@ -357,13 +302,13 @@ def create_word_file(result, output_file):
357
 
358
  if title_paragraph:
359
  elements = [element for element in elements if element[2] != title_paragraph]
360
- doc.add_heading(title_paragraph.content, level=1)
 
 
361
 
362
  # Continuous combine paragraphs, tables, and selection marks in the order they appear on the page
363
  elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
364
  for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
365
- elements.extend([(mark.polygon[0].y + mark.polygon[0].x*0.01, 'selection_mark', mark)
366
- for mark in page.selection_marks])
367
 
368
  # Sort elements by the sum of their horizontal and vertical positions on the page
369
  elements.sort(key=lambda x: x[0])
@@ -375,7 +320,8 @@ def create_word_file(result, output_file):
375
  # Skip lines that are part of a table
376
  if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
377
  continue
378
- doc.add_paragraph(element.content)
 
379
  elif element_type == 'table':
380
  table = doc.add_table(rows=element.row_count, cols=element.column_count)
381
  table.style = 'Table Grid'
@@ -385,18 +331,10 @@ def create_word_file(result, output_file):
385
  cell_content = ""
386
  for cell in element.cells:
387
  if cell.row_index == row_idx and cell.column_index == col_idx:
388
- cell_content = cell.content
389
  table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
390
  break
391
  row_cells[col_idx].text = cell_content
392
- elif element_type == 'selection_mark':
393
- p = doc.add_paragraph()
394
- run = p.add_run()
395
- if element.state == "selected":
396
- run.add_text("☑ ")
397
- else:
398
- run.add_text("☐ ")
399
-
400
  # Save Word document
401
  doc.save(output_file)
402
 
@@ -407,19 +345,23 @@ def format_polygon(polygon):
407
  return ", ".join([f"[{p.x}, {p.y}]" for p in polygon])
408
 
409
  def get_table_max_polygon(table):
410
- """Get the maximum polygon coordinates for a table"""
411
  first_cell = table.cells[0]
412
  first_coordinate = first_cell.bounding_regions[0].polygon[0]
 
 
413
  last_cell = table.cells[-1]
414
- last_coordinate = last_cell.bounding_regions[0].polygon[-1]
 
 
415
  return [first_coordinate, last_coordinate]
416
 
417
  def is_element_inside_table(element, table_max_polygon):
418
- """Check if an element is inside a table"""
419
- element_x = element.bounding_regions[0].polygon[0].x
420
- element_y = element.bounding_regions[0].polygon[0].y
421
  first_coordinate = table_max_polygon[0]
422
- last_coordinate = table_max_polygon[1]
423
 
424
  return (first_coordinate.x <= element_x <= last_coordinate.x and
425
- first_coordinate.y <= element_y <= last_coordinate.y)
 
34
 
35
  # Analyze the document
36
  result = analyze_document(temp_pdf_path)
37
+
38
  # Create markdown file
39
  temp_md_path = "temp.md"
40
  create_markdown_file(result, temp_md_path)
 
71
  try:
72
  # Read the markdown content
73
  content = await file.read()
 
74
 
75
+ # Save the content to a temporary file
76
+ temp_pdf_path = "temp.pdf"
77
+ with open(temp_pdf_path, "wb") as f:
78
+ f.write(content)
79
 
80
+ # Analyze the document
81
+ result = analyze_document(temp_pdf_path)
82
+
83
+ tables = []
84
+ for table in result.tables:
85
+ table_data = []
86
+ for cell in table.cells:
87
+ table_data.append({
88
+ "row_index": cell.row_index,
89
+ "column_index": cell.column_index,
90
+ "text": cell.content
91
+ })
92
+ tables.append(table_data)
93
 
94
  # Create Excel file
95
  excel_buffer = create_excel_from_markdown_tables(tables)
 
116
  StreamingResponse: Word document file
117
  """
118
  try:
119
+ # Read the uploaded file content
120
  content = await file.read()
 
121
 
122
+ # Save the content to a temporary file
123
+ temp_pdf_path = "temp.pdf"
124
+ with open(temp_pdf_path, "wb") as f:
125
+ f.write(content)
126
+
127
+ # Analyze the document
128
+ result = analyze_document(temp_pdf_path)
129
+ # Create word file
130
+ temp_word_path = "temp.docx"
131
+ create_word_file(result, temp_word_path)
132
 
133
+ # Read the word file
134
+ with open(temp_word_path, "rb") as f:
135
  word_content = f.read()
136
 
137
+ # Clean up temporary files
138
+ os.remove(temp_pdf_path)
139
+ os.remove(temp_word_path)
140
 
141
  # Return the Word file as a download
142
  return StreamingResponse(
 
153
  def analyze_document(file_path):
154
  """Analyze document using Azure Form Recognizer"""
155
  endpoint = "https://aal-ocr-ai-azureapi.cognitiveservices.azure.com/"
156
+ # endpoint = "https://zzaocrtool.cognitiveservices.azure.com/"
157
  key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
158
 
159
  document_analysis_client = DocumentAnalysisClient(
 
164
  poller = document_analysis_client.begin_analyze_document(
165
  "prebuilt-layout", document=f
166
  )
167
+
168
  result = poller.result()
169
  return result
170
 
 
205
  """Create Excel file from markdown tables"""
206
  excel_buffer = BytesIO()
207
 
208
+ with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
209
  for i, table in enumerate(tables):
210
+ df = pd.DataFrame(table)
211
+ df_pivot = df.pivot(index='row_index', columns='column_index', values='text')
212
+ sheet_name = f'Sheet{i+1}'
213
+ df_pivot.to_excel(writer, sheet_name=sheet_name, index=False)
 
 
 
214
 
215
  excel_buffer.seek(0)
216
  return excel_buffer
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def create_markdown_file(result, output_file):
219
  """Create markdown file from analysis result"""
220
  with open(output_file, 'w', encoding='utf-8') as md_file:
 
222
  # md_file.write(f"### Page {page.page_number}\n\n")
223
 
224
  elements = []
225
+ elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
226
  for paragraph in result.paragraphs if paragraph.bounding_regions[0].page_number == page.page_number])
227
  elements.sort(key=lambda x: x[0])
228
 
 
244
  elements = [element for element in elements if element[2] != title_paragraph]
245
  md_file.write(f"# {title_paragraph.content}\n\n")
246
 
247
+ elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
248
  for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
 
249
 
250
  elements.sort(key=lambda x: x[0])
251
 
 
254
  if element_type == 'paragraph':
255
  if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
256
  continue
257
+ content = element.content.replace(":selected:", "").replace(":unselected:", "")
258
+ md_file.write(f"{content}\n\n")
259
 
260
  elif element_type == 'table':
261
  for row_idx in range(element.row_count):
 
264
  cell_content = ""
265
  for cell in element.cells:
266
  if cell.row_index == row_idx and cell.column_index == col_idx:
267
+ cell_content = cell.content.replace(":selected:", "").replace(":unselected:", "")
268
  table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
269
  break
270
  row_content += f"{cell_content} | "
271
  md_file.write(row_content + "\n")
272
  md_file.write("\n")
 
 
 
 
 
 
273
 
274
  def create_word_file(result, output_file):
275
  """Create Word document from analysis result"""
 
278
 
279
  # Analyze pages
280
  for page in result.pages:
281
+ doc.add_heading(f"File Page {page.page_number}", level=2)
282
  # Combine paragraphs, tables, and selection marks in the order they appear on the page
283
  elements = []
284
  elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
 
302
 
303
  if title_paragraph:
304
  elements = [element for element in elements if element[2] != title_paragraph]
305
+
306
+ title = title_paragraph
307
+ doc.add_heading(title.content, level=1)
308
 
309
  # Continuous combine paragraphs, tables, and selection marks in the order they appear on the page
310
  elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
311
  for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
 
 
312
 
313
  # Sort elements by the sum of their horizontal and vertical positions on the page
314
  elements.sort(key=lambda x: x[0])
 
320
  # Skip lines that are part of a table
321
  if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
322
  continue
323
+ content = element.content.replace(":selected:", "").replace(":unselected:", "")
324
+ doc.add_paragraph(content)
325
  elif element_type == 'table':
326
  table = doc.add_table(rows=element.row_count, cols=element.column_count)
327
  table.style = 'Table Grid'
 
331
  cell_content = ""
332
  for cell in element.cells:
333
  if cell.row_index == row_idx and cell.column_index == col_idx:
334
+ cell_content = cell.content.replace(":selected:", "").replace(":unselected:", "")
335
  table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
336
  break
337
  row_cells[col_idx].text = cell_content
 
 
 
 
 
 
 
 
338
  # Save Word document
339
  doc.save(output_file)
340
 
 
345
  return ", ".join([f"[{p.x}, {p.y}]" for p in polygon])
346
 
347
  def get_table_max_polygon(table):
348
+ # first coordinate: polygon[0] of the table's first cell
349
  first_cell = table.cells[0]
350
  first_coordinate = first_cell.bounding_regions[0].polygon[0]
351
+
352
+ # last coordinate: polygon[2] of the table's last cell
353
  last_cell = table.cells[-1]
354
+ last_coordinate = last_cell.bounding_regions[0].polygon[2]
355
+
356
+ # return max polygon
357
  return [first_coordinate, last_coordinate]
358
 
359
  def is_element_inside_table(element, table_max_polygon):
360
+ # use the midpoint of the element's bounding polygon (corners 0 and 2) for the containment test
361
+ element_x = (element.bounding_regions[0].polygon[0].x + element.bounding_regions[0].polygon[2].x)/2
362
+ element_y = (element.bounding_regions[0].polygon[0].y + element.bounding_regions[0].polygon[2].y)/2
363
  first_coordinate = table_max_polygon[0]
364
+ last_coordinate = table_max_polygon[1] # second entry of table_max_polygon; get_table_max_polygon sets it to polygon[2] of the last cell. TODO: confirm this is the intended corner
365
 
366
  return (first_coordinate.x <= element_x <= last_coordinate.x and
367
+ first_coordinate.y <= element_y <= last_coordinate.y)
requirements.txt CHANGED
@@ -10,4 +10,5 @@ azure-ai-formrecognizer==3.3.0
10
  python-dotenv==1.0.0
11
  python-docx==1.1.0
12
  pandas==2.1.4
13
- openpyxl==3.1.5
 
 
10
  python-dotenv==1.0.0
11
  python-docx==1.1.0
12
  pandas==2.1.4
13
+ openpyxl==3.1.5
14
+ xlsxwriter==3.2.2