Spaces:
Sleeping
Sleeping
more pdf text parsing
Browse files
app.py
CHANGED
@@ -23,13 +23,24 @@ client_anthropic = Anthropic(api_key=anthropic_api_key)
|
|
23 |
|
24 |
def parse_quotes(input_string, pdf_text):
|
25 |
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Find all matches of <quote>...</quote> and extract the content between the tags
|
29 |
matches = re.findall(r'<quote>(.*?)</quote>', input_string)
|
30 |
|
31 |
count = len(matches)
|
32 |
extracted_texts = matches
|
|
|
|
|
|
|
33 |
match_count = sum(1 for text in extracted_texts if text in pdf_text_wo_header)
|
34 |
|
35 |
return count, match_count
|
|
|
23 |
|
24 |
def parse_quotes(input_string, pdf_text, *,
                 header_pattern=r'\d+Under review as a conference paper at ICLR 2024'):
    """Count ``<quote>`` spans in *input_string* and how many occur in *pdf_text*.

    The PDF text is cleaned before comparison: every occurrence of
    *header_pattern* (by default the page-number-prefixed ICLR 2024 running
    header) is removed, and each run of whitespace -- including line breaks --
    is collapsed to a single space. Each quote gets the same whitespace
    normalization so that a quote differing from the PDF text only in line
    wrapping or spacing still matches.

    Args:
        input_string: Text containing zero or more ``<quote>...</quote>`` spans.
            ``None`` is treated as empty.
        pdf_text: Extracted PDF text to search. ``None`` is treated as empty.
        header_pattern: Regular expression for the running header to strip
            from *pdf_text* before matching.

    Returns:
        A ``(count, match_count)`` tuple: the number of ``<quote>`` spans found
        and the number of those whose normalized text is a substring of the
        cleaned PDF text.
    """
    def _collapse_whitespace(text):
        # One pass covers both newline removal and repeated-space cleanup.
        return re.sub(r'\s+', ' ', text).strip()

    pdf_text_wo_header = re.sub(header_pattern, '', pdf_text or '')
    pdf_text_wo_header = _collapse_whitespace(pdf_text_wo_header)

    # DOTALL lets a quote span several lines; without it such quotes are
    # silently dropped from both counts.
    matches = re.findall(r'<quote>(.*?)</quote>', input_string or '',
                         flags=re.DOTALL)
    count = len(matches)

    match_count = 0
    for quote in matches:
        normalized_quote = _collapse_whitespace(quote)
        # An empty quote is a substring of any text; do not count it as
        # verified against the PDF.
        if normalized_quote and normalized_quote in pdf_text_wo_header:
            match_count += 1

    return count, match_count
|