Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -66,6 +66,31 @@ def split_text(text, max_tokens=500):
|
|
66 |
|
67 |
return chunks
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)
|
71 |
|
@@ -165,6 +190,9 @@ class CombinedProcessor:
|
|
165 |
|
166 |
bibtex_entry = create_bibtex_entry(bibtex_data)
|
167 |
bibtex_entries.append(bibtex_entry)
|
|
|
|
|
|
|
168 |
|
169 |
# Join BibTeX entries with HTML formatting
|
170 |
formatted_entries = [html.escape(entry) for entry in bibtex_entries]
|
|
|
66 |
|
67 |
return chunks
|
68 |
|
69 |
+
def disambiguate_bibtex_ids(bibtex_entries):
    """Make the citation keys in a list of BibTeX entry strings unique.

    The first entry carrying a given key is left untouched. Every later entry
    with the same key gets an alphabetic suffix appended to that key:
    ``a``, ``b``, ..., ``z``, ``aa``, ``ab``, ... Suffixes that would produce
    a key already present anywhere in the input (or already handed out) are
    skipped, so the returned entries never share a key.

    Args:
        bibtex_entries: Iterable of BibTeX entry strings, each expected to
            start with something like ``@article{key,``.

    Returns:
        A new list with the same length and order as the input. Entries in
        which no citation key can be found are passed through unchanged.
    """
    # The key is whatever sits between "@type{" and the first comma. BibTeX
    # keys may contain '-', ':', '.', etc., so match any run of characters
    # that cannot be a delimiter instead of restricting to \w.
    key_pattern = re.compile(r'@\w+\{([^\s,{}]+),')

    entries = list(bibtex_entries)
    matches = [key_pattern.search(entry) for entry in entries]

    # Reserve every key that already exists in the input up front, so that a
    # generated key (e.g. "smith2020a") cannot clash with a real one that
    # appears later in the list.
    taken = {m.group(1) for m in matches if m is not None}

    # Per original key: the index of the last suffix handed out (0 = none yet).
    last_suffix = {}
    disambiguated_entries = []

    for entry, m in zip(entries, matches):
        if m is None:
            disambiguated_entries.append(entry)
            continue

        original_id = m.group(1)
        if original_id not in last_suffix:
            # First sighting keeps its key as-is.
            last_suffix[original_id] = 0
            disambiguated_entries.append(entry)
            continue

        # Advance to the next suffix whose resulting key is still free.
        n = last_suffix[original_id]
        while True:
            n += 1
            new_id = original_id + _alpha_suffix(n)
            if new_id not in taken:
                break
        last_suffix[original_id] = n
        taken.add(new_id)

        # Splice the new key in by position rather than via a replacement
        # template: a key that starts with a digit would otherwise fuse with
        # the "\1" backreference and corrupt the entry.
        start, end = m.span(1)
        disambiguated_entries.append(entry[:start] + new_id + entry[end:])

    return disambiguated_entries


def _alpha_suffix(n):
    """Return the n-th (1-based) suffix in the series a..z, aa, ab, ..."""
    letters = []
    while n > 0:
        # Bijective base-26: shift by one so that 26 maps to 'z', 27 to 'aa'.
        n, remainder = divmod(n - 1, 26)
        letters.append(chr(ord('a') + remainder))
    return ''.join(reversed(letters))
|
93 |
+
|
94 |
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)
|
96 |
|
|
|
190 |
|
191 |
bibtex_entry = create_bibtex_entry(bibtex_data)
|
192 |
bibtex_entries.append(bibtex_entry)
|
193 |
+
|
194 |
+
#Disambiguation to avoid duplicate ids.
|
195 |
+
bibtex_entries = disambiguate_bibtex_ids(bibtex_entries)
|
196 |
|
197 |
# Join BibTeX entries with HTML formatting
|
198 |
formatted_entries = [html.escape(entry) for entry in bibtex_entries]
|