Davidsamuel101 committed on
Commit b49f075 · 1 Parent(s): f262292

Fix get_slides function in text_extractor.py

Sample Document.md ADDED
@@ -0,0 +1,22 @@
+
+ Sample Document Presentation
+ ============================
+
+
+ ---
+ # Group Members:
+
+
+ Stella Shania Mintara, David Samuel and Egivenia.
+
+ ---
+ # Case Problem
+
+
+ FreshMart is already well-established; they have enough resources to buy and own servers. They prefer to outsource server management to another party so they don’t need to search for and hire talent to run and manage the servers.
+
+ ---
+ # I. Data Source
+
+
+ The data source is obtained from Fresh Mart’s surveillance cameras. This data will be ingested in the ingestion layer using Apache Kafka.
__pycache__/app.cpython-38.pyc CHANGED
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
 
__pycache__/text_extractor.cpython-38.pyc CHANGED
Binary files a/__pycache__/text_extractor.cpython-38.pyc and b/__pycache__/text_extractor.cpython-38.pyc differ
 
app.py CHANGED
@@ -25,9 +25,11 @@ def summarize(slides):
     for idx, (tag, content) in enumerate(contents):
         if tag.startswith('p'):
             try:
+                print(f"Content: {content}")
                 input = tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(device)
                 tensor = model.generate(**input)
                 summary = tokenizer.batch_decode(tensor, skip_special_tokens=True)[0]
+                print(f"Summary: {summary}")
                 contents[idx] = (tag, summary)
             except Exception as e:
                 print(e)
@@ -56,7 +58,6 @@ def inference(document):
     global FILENAME
     doc = fitz.open(document)
     FILENAME = Path(doc.name).stem
-    print(FILENAME)
     font_counts, styles = preprocess.get_font_info(doc, granularity=False)
     size_tag = preprocess.get_font_tags(font_counts, styles)
     texts = preprocess.assign_tags(doc, size_tag)
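For context, the loop in the first hunk follows the usual Hugging Face transformers pattern of tokenize, generate, then decode, and the added print calls simply log the text before and after that round trip. Below is a minimal, self-contained sketch of that pattern under stated assumptions: the checkpoint name and the sample `contents` list are illustrative placeholders, since the commit does not show which model app.py loads or what the real data looks like.

```python
# Minimal sketch of the summarization loop touched by the hunk above.
# Assumptions: the checkpoint name and sample data are placeholders only.
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"
checkpoint = "sshleifer/distilbart-cnn-12-6"  # assumed checkpoint, not taken from app.py
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

# Illustrative (tag, text) pairs in the same shape the loop iterates over.
contents = [
    ("h1", "Case Problem"),
    ("p1", "FreshMart is already well-established; they have enough resources to buy and own servers."),
]

for idx, (tag, content) in enumerate(contents):
    if tag.startswith("p"):  # only paragraph-tagged text is summarized; headers pass through
        inputs = tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(device)
        generated = model.generate(**inputs)
        summary = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        contents[idx] = (tag, summary)

print(contents)
```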
text_extractor.py CHANGED
@@ -126,13 +126,10 @@ class TextExtractor:
                 my_tuple = tuple(my_list)
                 section[-1] = my_tuple  # Append the concatenated paragraph back to the section
             elif paragraph:
-                paragraph = re.sub(' +', ' ', paragraph)
+                paragraph = re.sub(' +', ' ', paragraph)  # Replace any run of repeated spaces in the paragraph
                 section.append((tag, paragraph))
             try:
-                if next_text is None:
-                    slides[f"Page {page}"] = section
-                    page += 1
-                elif re.search(r'(?<=<)(.*?)(?=>)', next_text).group() == 'h1':
+                if re.search(r'(?<=<)(.*?)(?=>)', text).group() == 'h1':  # Create a new page when the current text is a type 1 header (title)
                     slides[f"Page {page}"] = section
                     page += 1
             except:
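The fix changes the page-splitting condition from a look-ahead on `next_text` to a check on the current `text`: whenever the tag extracted from the line is `h1`, the accumulated section is flushed as a new slide page. The snippet below is a toy demonstration of that tag-extraction regex and condition; the sample tagged strings are illustrative and not taken from the repository.

```python
import re

# Toy demonstration of the tag-extraction regex from the hunk above: the
# lookbehind/lookahead pair captures whatever sits between '<' and '>'.
# The sample strings below are illustrative only.
samples = ["<h1>Case Problem", "<p1>FreshMart is already well-established.", "<h1>I. Data Source"]

for text in samples:
    tag = re.search(r'(?<=<)(.*?)(?=>)', text).group()
    action = "start a new page" if tag == 'h1' else "stay on the current page"
    print(f"{tag!r}: {action}")
```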