Commit b49f075
Parent(s): f262292
Fix get_slides function in text_extractor.py
Browse files:
- Sample Document.md +22 -0
- __pycache__/app.cpython-38.pyc +0 -0
- __pycache__/text_extractor.cpython-38.pyc +0 -0
- app.py +2 -1
- text_extractor.py +2 -5
Sample Document.md
ADDED
@@ -0,0 +1,22 @@
+
+Sample Document Presentation
+============================
+
+
+---
+# Group Members:
+
+
+Stella Shania Mintara, David Samuel and Egivenia.
+
+---
+# Case Problem
+
+
+FreshMart is already well-established; they have enough resources to buy and own servers. They prefer to outsource server management to another party so they don’t need to search for and hire the talent to run and manage the servers.
+
+---
+# I. Data Source
+
+
+The data source is obtained from FreshMart’s surveillance cameras. This data will be ingested in the ingestion layer using Apache Kafka.
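(The deck's final slide names Apache Kafka as its ingestion layer. Purely to illustrate that sentence, a kafka-python producer push might look like the sketch below; the broker address, topic, and frame file are hypothetical and unrelated to this Space.)

from kafka import KafkaProducer  # pip install kafka-python

# Hypothetical ingestion step for the architecture the slide describes;
# none of these names exist in this repository.
producer = KafkaProducer(bootstrap_servers="localhost:9092")
with open("frame_0001.jpg", "rb") as f:  # assumed camera frame dump
    producer.send("freshmart.camera.frames", f.read())
producer.flush()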
__pycache__/app.cpython-38.pyc
CHANGED
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
__pycache__/text_extractor.cpython-38.pyc
CHANGED
Binary files a/__pycache__/text_extractor.cpython-38.pyc and b/__pycache__/text_extractor.cpython-38.pyc differ
app.py
CHANGED
@@ -25,9 +25,11 @@ def summarize(slides):
     for idx, (tag, content) in enumerate(contents):
         if tag.startswith('p'):
             try:
+                print(f"Content: {content}")
                 input = tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(device)
                 tensor = model.generate(**input)
                 summary = tokenizer.batch_decode(tensor, skip_special_tokens=True)[0]
+                print(f"Summary: {summary}")
                 contents[idx] = (tag, summary)
             except Exception as e:
                 print(e)
@@ -56,7 +58,6 @@ def inference(document):
     global FILENAME
     doc = fitz.open(document)
     FILENAME = Path(doc.name).stem
-    print(FILENAME)
     font_counts, styles = preprocess.get_font_info(doc, granularity=False)
     size_tag = preprocess.get_font_tags(font_counts, styles)
     texts = preprocess.assign_tags(doc, size_tag)
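For reference, the try block patched above is the standard Hugging Face seq2seq generate-and-decode loop. A minimal standalone sketch of the same pattern, assuming a distilbart-cnn checkpoint and dummy slide contents (the diff does not show which model or device setup app.py actually uses):

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
checkpoint = "sshleifer/distilbart-cnn-12-6"  # assumed; not taken from this repo
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

contents = [("h1", "I. Data Source"), ("p", "The data source is obtained from ...")]

for idx, (tag, content) in enumerate(contents):
    if tag.startswith("p"):
        try:
            print(f"Content: {content}")
            # Tokenize, generate, and decode exactly as the patched loop does
            inputs = tokenizer(content, truncation=True, padding="longest",
                               return_tensors="pt").to(device)
            tensor = model.generate(**inputs)
            summary = tokenizer.batch_decode(tensor, skip_special_tokens=True)[0]
            print(f"Summary: {summary}")
            contents[idx] = (tag, summary)
        except Exception as e:
            print(e)

Naming the tokenized batch inputs rather than input also avoids shadowing the Python builtin, which the committed code currently does.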
text_extractor.py
CHANGED
@@ -126,13 +126,10 @@ class TextExtractor:
             my_tuple = tuple(my_list)
             section[-1] = my_tuple # Append the concatenated paragraph back to the section
         elif paragraph:
-            paragraph = re.sub(' +', ' ', paragraph)
+            paragraph = re.sub(' +', ' ', paragraph) # Replace any double space in the paragraph
             section.append((tag, paragraph))
         try:
-            if
-                slides[f"Page {page}"] = section
-                page += 1
-            elif re.search(r'(?<=<)(.*?)(?=>)', next_text).group() == 'h1':
+            if re.search(r'(?<=<)(.*?)(?=>)', text).group() == 'h1': # Create a new page when the current text is a type 1 header or title
                 slides[f"Page {page}"] = section
                 page += 1
         except:
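Net effect of the commit: the dangling if and stray elif that crashed get_slides are collapsed into one condition, so the accumulated section is flushed to a new page whenever the current text carries an h1 tag. A condensed sketch of that splitting rule, with the tagged-string format and sample data assumed rather than taken from the repo:

import re

# Tagged lines in the shape preprocess.assign_tags appears to emit (assumed)
texts = [
    "<h1>Case Problem",
    "<p>FreshMart is already  well-established ...",
    "<h1>I. Data Source",
    "<p>The data source is obtained from  FreshMart's cameras ...",
]

slides, section, page = {}, [], 1
for text in texts:
    tag = re.search(r'(?<=<)(.*?)(?=>)', text).group()    # tag name between < and >
    paragraph = re.sub(' +', ' ', text.split('>', 1)[1])  # collapse repeated spaces
    if tag == 'h1' and section:                           # a type 1 header starts a new page
        slides[f"Page {page}"] = section
        section = []
        page += 1
    section.append((tag, paragraph))
if section:                                               # flush the final page
    slides[f"Page {page}"] = section

print(slides)
# {'Page 1': [('h1', 'Case Problem'), ('p', '...')], 'Page 2': [('h1', 'I. Data Source'), ('p', '...')]}

One caveat: the bare except: in the patched file still swallows the AttributeError that re.search(...).group() raises when a string carries no <tag> prefix; catching that case explicitly would make future failures easier to spot.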