Spaces:
Runtime error
Runtime error
Commit
·
d9b7d2f
1
Parent(s):
f0a8738
Add text extractor
Browse files- text_extractor.py +140 -0
text_extractor.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from operator import itemgetter
|
| 2 |
+
from collections import OrderedDict
|
| 3 |
+
from typing import Dict, List, Iterator, Union, Tuple
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
class TextExtractor:
    """Turn the span-level text of a PDF into tagged strings and slide sections.

    Every method is a static helper operating on a document object that is
    iterated page by page; each page must provide ``get_text("dict")``
    returning a mapping with a ``"blocks"`` list (the docstrings inherited
    from the original code name ``fitz.fitz.Document`` as the expected type).
    """

    # Captures the tag name between the first "<" and the following ">".
    _TAG_RE = re.compile(r'(?<=<)(.*?)(?=>)')
    # Matches any "<...>" tag or a single pipe; used to clean emitted text.
    _TAG_OR_PIPE_RE = re.compile(r'<.*?>|\|')
    # Two or more consecutive pipes separate paragraphs inside one string.
    _PARAGRAPH_BREAK_RE = re.compile(r'\|{2,}')

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """
        Count how often each font style occurs in the text spans of a document.

        Only blocks whose ``type`` is 0 are inspected, and spans whose text is
        empty after stripping whitespace are ignored.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
            granularity (bool, optional): When True, the style key is built from
                size, flags and font name and the stored style also records the
                span colour (colour is not part of the key). When False, the
                key is the size alone. Defaults to False.

        Raises:
            ValueError: If no non-blank text span was found.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``.
            ``font_counts`` is a list of ``(style key, occurrences)`` sorted by
            occurrences, highest first; ``styles`` maps each style key to the
            attributes of the last span seen with that key.
        """
        styles = {}
        occurrences = {}

        for page in doc:
            for block in page.get_text('dict')['blocks']:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            style = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            style = {'size': span['size'], 'font': span['font']}
                        styles[identifier] = style
                        occurrences[identifier] = occurrences.get(identifier, 0) + 1

        # sorted() is stable, so equally frequent styles keep first-seen order.
        font_counts = sorted(occurrences.items(), key=itemgetter(1), reverse=True)

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts: List[Tuple[str, int]], styles: Dict[str, Dict]) -> Dict[float, str]:
        """
        Map every font size to a textual tag relative to the most frequent style.

        The size of the first entry of ``font_counts`` is treated as the
        paragraph size and tagged ``<p>``. Sizes are ranked from largest to
        smallest; a larger size at rank ``i`` (0-based) becomes ``<h{i+1}>``
        and a smaller one becomes ``<s{i+1}>``.

        Args:
            font_counts (list): ``(style key, occurrences)`` pairs, most
                frequent first, as returned by ``get_font_info``.
            styles (dict): Style key -> style attributes; each style holds a
                ``'size'`` entry.

        Raises:
            IndexError: If ``font_counts`` is empty.

        Returns:
            dict: Font size -> tag string.
        """
        p_size = styles[font_counts[0][0]]['size']

        def size_of(identifier):
            # Granular keys look like "size_flags_font", so the key itself is
            # not always a number; prefer the recorded style and fall back to
            # the leading "size" component of the key.
            style = styles.get(identifier)
            if style is not None:
                return float(style['size'])
            return float(str(identifier).split('_', 1)[0])

        # sorting the font sizes high to low, so that we can append the right integer to each tag
        font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)

        size_tag = {p_size: "<p>"}
        for rank, size in enumerate(font_sizes, start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag) -> List[str]:
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same size inside a block are joined with a
        space; a "|" is appended after every line once the current string is
        non-empty. A change of font size flushes the string collected so far.
        The collected string is appended at the end of every text block even
        when it is empty, so the result may contain empty strings.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Raises:
            KeyError: If a span that opens a new string has a size missing
                from ``size_tag``.

        Returns:
            list: Texts with pre-pended element tags.
        """
        texts = []
        previous_s = {}  # last span that contributed text; empty until the first one

        for page in doc:
            for b in page.get_text("dict")["blocks"]:
                if b['type'] != 0:
                    continue
                block_string = ""
                for l in b["lines"]:
                    for s in l["spans"]:
                        # Spans made only of punctuation and whitespace are skipped.
                        if not re.sub(r"[^\w\s]", '', s["text"]).strip():
                            continue
                        if not previous_s:  # First span
                            block_string = size_tag[s['size']] + s['text']
                        elif s['size'] == previous_s['size']:
                            if not block_string or all(c == "|" for c in block_string):  # New block
                                block_string = size_tag[s['size']] + s['text']
                            else:  # in the same block, so concatenate strings
                                block_string += f" {s['text']}"
                        else:
                            # Size changed: emit what was collected and start over.
                            texts.append(block_string)
                            block_string = size_tag[s['size']] + s['text']
                        previous_s = s
                    if block_string:
                        block_string += "|"  # marks the end of a line
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts: List[str]) -> Dict[str, List[Tuple[str, str]]]:
        """
        Group tagged texts into numbered slides, starting a new slide at each h1.

        A text tagged ``<h1>`` starts a fresh section; other ``<h…>`` texts are
        appended as headers. ``<p…>`` texts are split on runs of two or more
        pipes; a paragraph whose first character is lowercase is merged into
        the previous entry of the section, otherwise it is added as a new
        entry with repeated spaces collapsed. Texts with any other tag, or
        without a tag, add nothing. The section is stored under
        ``"Page <n>"`` whenever the following text is an ``<h1>`` or the
        input ends.

        Args:
            texts (list): Tagged strings such as those produced by ``assign_tags``.

        Returns:
            dict: ``"Page <n>"`` -> list of ``(tag, text)`` tuples.
        """
        tag_re = TextExtractor._TAG_RE
        strip_markup = TextExtractor._TAG_OR_PIPE_RE

        slides = {}
        section = []
        page = 1

        for text, next_text in zip(texts, texts[1:] + [None]):
            tag_match = tag_re.search(text)
            if tag_match:
                tag = tag_match.group()
                if tag == 'h1':
                    section = [('h1', strip_markup.sub('', text).strip())]
                elif tag.startswith('h'):  # non h1 headers
                    # Remove tag and pipes from the text
                    section.append((tag, strip_markup.sub('', text).strip()))
                elif tag.startswith('p'):
                    for paragraph in TextExtractor._PARAGRAPH_BREAK_RE.split(text):
                        paragraph = strip_markup.sub('', paragraph).strip()
                        if not paragraph:
                            continue
                        if paragraph[0].islower() and section:
                            # A lowercase start is treated as the continuation
                            # of the previous entry, so merge instead of adding.
                            previous_tag, previous_text = section[-1]
                            section[-1] = (previous_tag, f"{previous_text} {paragraph}")
                        else:
                            # Also reached for a lowercase paragraph when there
                            # is no previous entry to merge into.
                            section.append((tag, re.sub(' +', ' ', paragraph)))

            # Close the slide at the end of the input or right before the next h1.
            if next_text is None:
                closes_slide = True
            else:
                next_match = tag_re.search(next_text)
                closes_slide = next_match is not None and next_match.group() == 'h1'
            if closes_slide:
                slides[f"Page {page}"] = section
                page += 1
        return slides
|