# NOTE: the following details are page chrome captured alongside this file by a
# web file viewer; they are not part of the program.
#   Spaces status: "Runtime error"; file size: 6,135 bytes; revision: d9b7d2f.
#   (The viewer's line-number gutter, 1-140, has been omitted.)
from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple
import re
class TextExtractor:
    """Turn the spans of a PDF document into tagged text and per-page sections.

    The intended pipeline is ``get_font_info`` -> ``get_font_tags`` ->
    ``assign_tags`` -> ``get_slides``. All methods are static; the class only
    groups them under one name.
    """

    # Captures the tag name inside the first "<...>" of a tagged string.
    _TAG_PATTERN = re.compile(r'(?<=<)(.*?)(?=>)')
    # Matches every "<...>" tag and every pipe so they can be stripped from text.
    _MARKUP_PATTERN = re.compile(r'<.*?>|\|')
    # Two or more consecutive pipes separate paragraphs inside one "<p>" string.
    _PARAGRAPH_BREAK_PATTERN = re.compile(r'\|{2,}')
    # Anything that is neither a word character nor whitespace.
    _PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")

    def __init__(self) -> None:
        """Create an extractor; there is no instance state to set up."""
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity: bool = False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """Count how often each font style occurs among the document's text spans.

        Args:
            doc: Iterable of pages, each exposing ``get_text('dict')`` that
                returns a mapping with a ``'blocks'`` list (the docstring of the
                original code names ``fitz.Document`` as the expected type).
            granularity (bool, optional): When True the style identifier is
                ``"<size>_<flags>_<font>"`` and the stored style also carries
                'flags' and 'color'. When False the identifier is the size
                alone. Defaults to False.

        Raises:
            ValueError: If no span with non-blank text was found.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``
            where ``font_counts`` is a list of ``(identifier, occurrences)``
            sorted by occurrences, highest first, and ``styles`` maps each
            identifier to the attributes of the last span seen with it.
        """
        styles: Dict[str, Dict] = {}
        occurrences: Dict[str, int] = {}
        for page in doc:
            for block in page.get_text('dict')['blocks']:
                # Only blocks with type 0 are inspected; other block types
                # are skipped before their 'lines' key is ever touched.
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        occurrences[identifier] = occurrences.get(identifier, 0) + 1

        if not occurrences:
            raise ValueError("Zero discriminating fonts found!")
        # sorted() is stable, so identifiers with equal counts keep first-seen order.
        font_counts = sorted(occurrences.items(), key=itemgetter(1), reverse=True)
        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts: List[Tuple[str, int]], styles: Dict[str, Dict]) -> Dict[float, str]:
        """Map every font size to an element tag relative to the paragraph size.

        The size of the most frequent style (``font_counts[0]``) is treated as
        the paragraph size and tagged ``<p>``. Larger sizes become ``<hN>`` and
        smaller sizes ``<sN>``, where N is the 1-based position of the size in
        the list of all distinct sizes sorted from largest to smallest (so the
        numbering of ``<sN>`` continues after the headers and the paragraph).

        Args:
            font_counts: ``(identifier, occurrences)`` pairs, most frequent first,
                as returned by ``get_font_info``.
            styles: Mapping from identifier to a dict that contains 'size'.

        Raises:
            IndexError: If ``font_counts`` is empty.

        Returns:
            Dict[float, str]: Tag for each distinct font size.
        """
        p_size = styles[font_counts[0][0]]['size']

        def size_of(identifier):
            # Granular identifiers ("12.0_0_Arial") cannot be parsed with
            # float(), so the size is read from the style record. Parsing the
            # identifier is kept only as a fallback for identifiers that have
            # no style entry.
            if identifier in styles:
                return float(styles[identifier]['size'])
            return float(identifier)

        # Largest first, so the enumerate index yields the right tag number.
        font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)
        size_tag = {p_size: "<p>"}
        for i, size in enumerate(font_sizes):
            if size > p_size:
                size_tag[size] = f"<h{i+1}>"
            elif size < p_size:
                size_tag[size] = f"<s{i+1}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag):
        """Scrape headers and paragraphs from the document as tagged strings.

        Consecutive spans of the same font size inside a block are joined with
        single spaces behind one size tag. A pipe is appended at the end of
        every line once the running string is non-empty, so runs of pipes
        record line ends. Spans whose text is empty after removing punctuation
        and surrounding whitespace are ignored.

        Args:
            doc: Iterable of pages exposing ``get_text("dict")`` (see
                ``get_font_info``).
            size_tag (dict): Element tag for each font size, as returned by
                ``get_font_tags``.

        Returns:
            list: Texts with prepended element tags. The list can contain empty
            strings: one is appended for a block without usable text, and one
            whenever the font size changes on the first span of a block.
        """
        texts = []
        previous_s = {}
        for page in doc:
            for b in page.get_text("dict")["blocks"]:
                if b['type'] != 0:
                    continue
                block_string = ""
                for l in b["lines"]:
                    for s in l["spans"]:
                        if not TextExtractor._PUNCTUATION_PATTERN.sub('', s["text"]).strip():
                            continue
                        if not previous_s:  # very first usable span of the document
                            block_string = size_tag[s['size']] + s['text']
                        elif s['size'] == previous_s['size']:
                            if not block_string.strip("|"):
                                # Empty or pipes only: a new block has started.
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                # Same block and same size: keep concatenating.
                                block_string += f" {s['text']}"
                        else:
                            # Size changed: flush what was collected so far.
                            texts.append(block_string)
                            block_string = size_tag[s['size']] + s['text']
                        previous_s = s
                    if block_string:
                        block_string += "|"  # marks the end of a line
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """Group tagged texts into pages, starting a new page at every ``<h1>``.

        Args:
            texts (list): Tagged strings as produced by ``assign_tags``. Strings
                without a ``<...>`` tag (including empty strings) add no content.

        Returns:
            dict: ``"Page N"`` -> list of ``(tag, text)`` tuples in reading order.
            Only tags starting with 'h' or 'p' contribute entries. A paragraph
            that starts with a lowercase character is appended to the text of
            the previous entry of the section instead of starting a new one.
        """
        tag_pattern = TextExtractor._TAG_PATTERN
        markup_pattern = TextExtractor._MARKUP_PATTERN

        slides = {}
        section = []
        page = 1
        following = list(texts[1:]) + [None]
        for text, next_text in zip(texts, following):
            tag_match = tag_pattern.search(text)
            if tag_match:
                tag = tag_match.group()
                if tag == 'h1':
                    # A top-level header opens a fresh section.
                    section = [('h1', markup_pattern.sub('', text).strip())]
                elif tag.startswith('h'):  # non h1 headers
                    section.append((tag, markup_pattern.sub('', text).strip()))
                elif tag.startswith('p'):
                    for paragraph in TextExtractor._PARAGRAPH_BREAK_PATTERN.split(text):
                        paragraph = markup_pattern.sub('', paragraph).strip()
                        if not paragraph:
                            continue
                        if paragraph[0].islower() and section:
                            # Looks like the continuation of the previous entry
                            # that was split across blocks: glue it back on.
                            last_tag, last_text = section[-1]
                            section[-1] = (last_tag, f"{last_text} {paragraph}")
                        else:
                            # Also reached for a lowercase paragraph when there
                            # is nothing to attach it to yet.
                            section.append((tag, re.sub(' +', ' ', paragraph)))

            # The section is stored when the input ends or when the next text
            # opens a new <h1> page. Untagged next texts never trigger a flush.
            if next_text is None:
                page_ends = True
            else:
                next_match = tag_pattern.search(next_text)
                page_ends = next_match is not None and next_match.group() == 'h1'
            if page_ends:
                slides[f"Page {page}"] = section
                page += 1
        return slides