nityathakkar committed on
Commit
262ae19
·
verified ·
1 Parent(s): 6834f57

more pdf text parsing

Browse files
Files changed (1) hide show
  1. app.py +12 -1
app.py CHANGED
@@ -23,13 +23,24 @@ client_anthropic = Anthropic(api_key=anthropic_api_key)
23
 
24
  def parse_quotes(input_string, pdf_text):
25
 
26
- pdf_text_wo_header = re.sub(r'(?m)^Under review as a conference paper at ICLR 2024*$', '', pdf_text)
 
 
 
 
 
 
 
 
27
 
28
  # Find all matches of <quote>...</quote> and extract the content between the tags
29
  matches = re.findall(r'<quote>(.*?)</quote>', input_string)
30
 
31
  count = len(matches)
32
  extracted_texts = matches
 
 
 
33
  match_count = sum(1 for text in extracted_texts if text in pdf_text_wo_header)
34
 
35
  return count, match_count
 
23
 
24
  def parse_quotes(input_string, pdf_text):
25
 
26
+ header_pattern = r'\d+Under review as a conference paper at ICLR 2024'
27
+ #r'^\d+Under review as a conference paper at ICLR 2024\s*' # Pattern to match headers with variable page numbers
28
+ # r'(?m)^Under review as a conference paper at ICLR 2024*$'
29
+
30
+ pdf_text_wo_header = re.sub(header_pattern, '', pdf_text)
31
+ # Remove new lines that are not followed by a period, exclamation mark, or question mark
32
+ pdf_text_wo_header = re.sub(r'(?<!\.\s)(?<!\!\s)(?<!\?\s)\n+', ' ', pdf_text_wo_header)
33
+ # Remove extra spaces that may have been introduced
34
+ pdf_text_wo_header = re.sub(r'\s{2,}', ' ', pdf_text_wo_header).strip()
35
 
36
  # Find all matches of <quote>...</quote> and extract the content between the tags
37
  matches = re.findall(r'<quote>(.*?)</quote>', input_string)
38
 
39
  count = len(matches)
40
  extracted_texts = matches
41
+ # with open('pdf_text.txt', 'w') as file:
42
+ # file.write(pdf_text_wo_header)
43
+ # print(extracted_texts)
44
  match_count = sum(1 for text in extracted_texts if text in pdf_text_wo_header)
45
 
46
  return count, match_count