Pclanglais committed on
Commit
b97dcbf
·
verified ·
1 Parent(s): f4adb15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -0
app.py CHANGED
@@ -66,6 +66,31 @@ def split_text(text, max_tokens=500):
66
 
67
  return chunks
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def remove_punctuation(text):
70
  return re.sub(r'[^\w\s]', '', text)
71
 
@@ -165,6 +190,9 @@ class CombinedProcessor:
165
 
166
  bibtex_entry = create_bibtex_entry(bibtex_data)
167
  bibtex_entries.append(bibtex_entry)
 
 
 
168
 
169
  # Join BibTeX entries with HTML formatting
170
  formatted_entries = [html.escape(entry) for entry in bibtex_entries]
 
66
 
67
  return chunks
68
 
69
def _bibtex_id_suffix(n):
    """Return the n-th alphabetic suffix: 1 -> 'a', 26 -> 'z', 27 -> 'aa'."""
    letters = []
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        letters.append(chr(ord('a') + remainder))
    return ''.join(reversed(letters))


def disambiguate_bibtex_ids(bibtex_entries):
    """Make the citation keys of a list of BibTeX entries unique.

    The first entry carrying a given key is left untouched. Every later
    entry with the same key gets an alphabetic suffix appended to the key
    ('a', 'b', ..., 'z', 'aa', ...). A suffix is skipped when the resulting
    key is already used by another entry, so the output never contains two
    entries with the same key.

    Args:
        bibtex_entries: Iterable of BibTeX entry strings such as
            ``"@article{smith2020,\\n  title={...}\\n}"``.

    Returns:
        A new list with one string per input entry, in the same order.
        Entries in which no ``@type{key,`` header is found are returned
        unchanged.
    """
    # Keys may contain '-', ':', '.', '/' etc.; only whitespace, commas and
    # braces terminate a key.
    key_pattern = re.compile(r'@\w+\{\s*([^\s,{}]+)\s*,')

    entries = list(bibtex_entries)  # allow generators; we iterate twice
    matches = [key_pattern.search(entry) for entry in entries]

    # Every key that exists in the input is reserved up front so that a
    # generated key can never clash with one that appears later in the list.
    taken_ids = {match.group(1) for match in matches if match}

    last_suffix_index = {}
    disambiguated_entries = []

    for entry, match in zip(entries, matches):
        if not match:
            disambiguated_entries.append(entry)
            continue

        original_id = match.group(1)

        if original_id not in last_suffix_index:
            # First occurrence keeps its key as-is.
            last_suffix_index[original_id] = 0
            disambiguated_entries.append(entry)
            continue

        # Advance to the next suffix that yields an unused key.
        index = last_suffix_index[original_id]
        while True:
            index += 1
            new_id = original_id + _bibtex_id_suffix(index)
            if new_id not in taken_ids:
                break
        last_suffix_index[original_id] = index
        taken_ids.add(new_id)

        # Splice the new key in by position rather than through a regex
        # replacement template: a template like r'\1' + new_id is misparsed
        # when the key starts with a digit.
        start, end = match.span(1)
        disambiguated_entries.append(entry[:start] + new_id + entry[end:])

    return disambiguated_entries
93
+
94
  def remove_punctuation(text):
95
  return re.sub(r'[^\w\s]', '', text)
96
 
 
190
 
191
  bibtex_entry = create_bibtex_entry(bibtex_data)
192
  bibtex_entries.append(bibtex_entry)
193
+
194
+ #Disambiguation to avoid duplicate ids.
195
+ bibtex_entries = disambiguate_bibtex_ids(bibtex_entries)
196
 
197
  # Join BibTeX entries with HTML formatting
198
  formatted_entries = [html.escape(entry) for entry in bibtex_entries]