File size: 6,135 Bytes
d9b7d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple


import re

class TextExtractor:
    """Turn the text spans of a PDF document into tagged strings and slides.

    The helpers are meant to be chained:

    1. ``get_font_info`` counts how often each font style occurs.
    2. ``get_font_tags`` maps every font size to ``<p>``, ``<hN>`` or ``<sN>``.
    3. ``assign_tags`` walks the document and emits tag-prefixed strings.
    4. ``get_slides`` groups those strings into pages that start at ``<h1>``.

    ``doc`` arguments are expected to behave like a ``fitz`` (PyMuPDF)
    document: iterating yields pages and ``page.get_text('dict')`` returns a
    dict with a ``'blocks'`` list.
    """

    # Tag name: the text between the first '<' and the next '>'.
    _TAG_RE = re.compile(r'(?<=<)(.*?)(?=>)')
    # Any '<...>' tag or '|' line marker; stripped to obtain display text.
    _MARKUP_RE = re.compile(r'<.*?>|\|')
    # Two or more consecutive line markers separate paragraphs in a block.
    _PARAGRAPH_BREAK_RE = re.compile(r'\|{2,}')
    # Everything that is neither a word character nor whitespace.
    _NON_WORD_RE = re.compile(r"[^\w\s]")
    _SPACES_RE = re.compile(' +')

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity: bool = False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """
        Collect the font styles used in the document and how often each occurs.

        Only spans of text blocks (``type == 0``) whose text is not pure
        whitespace are counted.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
            granularity (bool, optional): When True a style is identified by
                size, flags and font name (its colour is recorded in the style
                but is not part of the identifier). When False only the size
                identifies a style. Defaults to False.

        Raises:
            ValueError: If no text span was found at all.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``.
            ``font_counts`` is a list of ``(identifier, occurrences)`` sorted by
            occurrences, most frequent first. ``styles`` maps every identifier
            to the attributes of the last span seen with that identifier.
        """
        styles = {}
        occurrences = {}

        for page in doc:
            for block in page.get_text('dict')['blocks']:
                if block['type'] != 0:  # not a text block
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        occurrences[identifier] = occurrences.get(identifier, 0) + 1

        # Stable sort: identifiers with equal counts keep first-seen order.
        font_counts = sorted(occurrences.items(), key=itemgetter(1), reverse=True)

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts: List[Tuple[str, int]], styles: Dict[str, Dict]) -> Dict[float, str]:
        """
        Map each font size to an element tag.

        The size of the most frequent style is treated as body text and tagged
        ``<p>``. Distinct sizes are ranked from largest to smallest; a larger
        size gets ``<hN>`` and a smaller one ``<sN>``, where ``N`` is the
        1-based rank of that size among all distinct sizes.

        Args:
            font_counts (List[Tuple[str, int]]): ``(identifier, occurrences)``
                pairs, most frequent first, as returned by ``get_font_info``.
            styles (Dict[str, Dict]): Style attributes per identifier; each
                entry must provide a ``'size'``.

        Raises:
            ValueError: If ``font_counts`` is empty.

        Returns:
            Dict[float, str]: Tag for every distinct font size.
        """
        if not font_counts:
            raise ValueError("Cannot derive tags without any font counts")

        p_size = styles[font_counts[0][0]]['size']

        # Read the size from the style record instead of parsing the
        # identifier: with granularity the identifier looks like
        # "12.0_4_Arial" and is not a number. Fall back to parsing only for
        # identifiers that have no style entry.
        distinct_sizes = set()
        for identifier, _ in font_counts:
            if identifier in styles:
                distinct_sizes.add(float(styles[identifier]['size']))
            else:
                distinct_sizes.add(float(identifier))

        size_tag = {p_size: "<p>"}
        # Largest first, so the biggest font becomes <h1>.
        for rank, size in enumerate(sorted(distinct_sizes, reverse=True), start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc: Iterator, size_tag: Dict[float, str]) -> List[str]:
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same size inside one block are joined with a
        space. A ``|`` is appended at the end of every line once the current
        string has content, so a line without usable text shows up as two
        consecutive pipes. A string is emitted whenever the font size changes
        and at the end of each block. Spans without any word character are
        ignored, and empty strings are never emitted.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Raises:
            KeyError: If a span uses a size that is missing from ``size_tag``.

        Returns:
            list: Texts with pre-prended element tags
        """
        texts = []
        previous_size = None
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:  # not a text block
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Skip spans that hold only punctuation/whitespace.
                        if not TextExtractor._NON_WORD_RE.sub('', span["text"]).strip():
                            continue
                        size = span['size']
                        if block_string and size == previous_size:
                            # Same style within the same block: keep joining.
                            block_string += f" {span['text']}"
                        else:
                            # Size changed or the block just started: flush
                            # what was collected and open a new tagged string.
                            if block_string:
                                texts.append(block_string)
                            block_string = size_tag[size] + span['text']
                        previous_size = size
                    if block_string:
                        block_string += "|"
                if block_string:
                    texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts: List[str]) -> Dict[str, List[Tuple[str, str]]]:
        """
        Group tagged texts into slides, one per ``<h1>`` header.

        Every ``<h1>`` starts a new section. Other ``<h…>`` texts are added as
        headers and ``<p>`` texts are split into paragraphs at runs of two or
        more pipes. A paragraph that starts with a lowercase letter is treated
        as the continuation of the previous paragraph of the same section and
        is joined to it with a space. Texts with any other tag add no entry.
        Texts without a tag (for example empty strings) are ignored entirely.

        A section is stored as ``"Page N"`` when the next tagged text is an
        ``<h1>`` or when the input ends.

        Args:
            texts (List[str]): Tag-prefixed strings as produced by ``assign_tags``.

        Returns:
            Dict[str, List[Tuple[str, str]]]: ``"Page N"`` mapped to the list of
            ``(tag, text)`` entries of that slide, numbered from 1.
        """
        strip_markup = TextExtractor._MARKUP_RE

        # Resolve tags up front so untagged texts can neither hide the end of
        # the input nor hide an upcoming <h1> from the look-ahead below.
        tagged = []
        for text in texts:
            tag_match = TextExtractor._TAG_RE.search(text) if text else None
            if tag_match:
                tagged.append((tag_match.group(), text))

        slides = {}
        section = []
        page = 1
        last_index = len(tagged) - 1

        for index, (tag, text) in enumerate(tagged):
            if tag == 'h1':
                section = [('h1', strip_markup.sub('', text).strip())]
            elif tag.startswith('h'):  # non h1 headers
                section.append((tag, strip_markup.sub('', text).strip()))
            elif tag.startswith('p'):
                for chunk in TextExtractor._PARAGRAPH_BREAK_RE.split(text):
                    paragraph = strip_markup.sub('', chunk).strip()
                    if not paragraph:
                        continue
                    paragraph = TextExtractor._SPACES_RE.sub(' ', paragraph)
                    continues_previous = (
                        paragraph[0].islower()
                        and section
                        and section[-1][0].startswith('p')
                    )
                    if continues_previous:
                        previous_tag, previous_text = section[-1]
                        section[-1] = (previous_tag, f"{previous_text} {paragraph}")
                    else:
                        section.append((tag, paragraph))

            if index == last_index or tagged[index + 1][0] == 'h1':
                slides[f"Page {page}"] = section
                page += 1
        return slides