siddhartharya commited on
Commit
0b28455
·
verified ·
1 Parent(s): 2cb8b24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -43
app.py CHANGED
@@ -8,9 +8,10 @@ from sentence_transformers import SentenceTransformer
8
  import faiss
9
  import numpy as np
10
  import pandas as pd
 
 
11
 
12
  # Initialize models and variables
13
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6") # Using a smaller model for resource efficiency
14
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
15
  faiss_index = None
16
  bookmarks = []
@@ -21,56 +22,73 @@ def parse_bookmarks(file_content):
21
  extracted_bookmarks = []
22
  for link in soup.find_all('a'):
23
  url = link.get('href')
24
- title = link.text
25
  if url and title:
26
  extracted_bookmarks.append({'url': url, 'title': title})
27
  return extracted_bookmarks
28
 
29
- def fetch_url_info(bookmark):
30
  url = bookmark['url']
31
  if url in fetch_cache:
32
  bookmark.update(fetch_cache[url])
33
  return bookmark
34
 
35
  try:
36
- response = requests.get(url, timeout=5)
37
- bookmark['etag'] = response.headers.get('ETag', 'N/A')
38
- bookmark['status_code'] = response.status_code
39
-
40
- if response.status_code >= 400:
41
- bookmark['dead_link'] = True
42
- bookmark['content'] = ''
43
- else:
44
- bookmark['dead_link'] = False
45
- soup = BeautifulSoup(response.content, 'html.parser')
46
- meta_tags = {meta.get('name', ''): meta.get('content', '') for meta in soup.find_all('meta')}
47
- bookmark['meta_tags'] = meta_tags
48
- bookmark['content'] = soup.get_text(separator=' ', strip=True)
 
 
 
 
 
 
 
 
 
 
49
  except Exception as e:
50
  bookmark['dead_link'] = True
51
  bookmark['etag'] = 'N/A'
52
  bookmark['status_code'] = 'N/A'
53
- bookmark['meta_tags'] = {}
54
- bookmark['content'] = ''
55
  finally:
56
  fetch_cache[url] = {
57
  'etag': bookmark.get('etag'),
58
  'status_code': bookmark.get('status_code'),
59
  'dead_link': bookmark.get('dead_link'),
60
- 'meta_tags': bookmark.get('meta_tags'),
61
- 'content': bookmark.get('content'),
62
  }
63
  return bookmark
64
 
 
 
 
 
 
 
 
 
65
  def generate_summary(bookmark):
66
- content = bookmark.get('content', '')
67
- if content:
68
- # Limit content to first 500 characters to save resources
69
- content = content[:500]
70
- summary = summarizer(content, max_length=50, min_length=25, do_sample=False)
71
- bookmark['summary'] = summary[0]['summary_text']
72
  else:
73
- bookmark['summary'] = 'No content available to summarize.'
 
 
 
 
74
  return bookmark
75
 
76
  def vectorize_and_index(bookmarks):
@@ -83,9 +101,10 @@ def vectorize_and_index(bookmarks):
83
 
84
  def display_bookmarks():
85
  data = []
 
86
  for i, bookmark in enumerate(bookmarks):
87
  status = "Dead Link" if bookmark.get('dead_link') else "Active"
88
- css_class = "dead-link" if bookmark.get('dead_link') else ""
89
  data.append({
90
  'Index': i,
91
  'Title': bookmark['title'],
@@ -93,17 +112,17 @@ def display_bookmarks():
93
  'Status': status,
94
  'ETag': bookmark.get('etag', 'N/A'),
95
  'Summary': bookmark.get('summary', ''),
96
- 'css_class': css_class
97
  })
 
98
  df = pd.DataFrame(data)
99
- return df
 
100
 
101
  def process_uploaded_file(file):
102
  global bookmarks, faiss_index
103
  if file is None:
104
  return "Please upload a bookmarks HTML file.", pd.DataFrame()
105
  try:
106
- # Decode the binary data to a string
107
  file_content = file.decode('utf-8')
108
  except UnicodeDecodeError:
109
  return "Error decoding the file. Please ensure it's a valid HTML file.", pd.DataFrame()
@@ -113,14 +132,17 @@ def process_uploaded_file(file):
113
  if not bookmarks:
114
  return "No bookmarks found in the uploaded file.", pd.DataFrame()
115
 
 
 
 
 
116
  for bookmark in bookmarks:
117
- fetch_url_info(bookmark)
118
  generate_summary(bookmark)
119
 
120
  faiss_index, embeddings = vectorize_and_index(bookmarks)
121
  message = f"Successfully processed {len(bookmarks)} bookmarks."
122
- bookmark_df = display_bookmarks()
123
- return message, bookmark_df
124
 
125
  def chatbot_response(user_query):
126
  if faiss_index is None or not bookmarks:
@@ -146,13 +168,14 @@ def edit_bookmark(bookmark_idx, new_title, new_url):
146
  return "Invalid bookmark index.", display_bookmarks()
147
  bookmarks[bookmark_idx]['title'] = new_title
148
  bookmarks[bookmark_idx]['url'] = new_url
149
- fetch_url_info(bookmarks[bookmark_idx])
 
150
  generate_summary(bookmarks[bookmark_idx])
151
  # Rebuild the FAISS index
152
  faiss_index, embeddings = vectorize_and_index(bookmarks)
153
  message = "Bookmark updated successfully."
154
- updated_df = display_bookmarks()
155
- return message, updated_df
156
  except Exception as e:
157
  return f"Error: {str(e)}", display_bookmarks()
158
 
@@ -169,8 +192,8 @@ def delete_bookmark(bookmark_idx):
169
  else:
170
  faiss_index = None
171
  message = "Bookmark deleted successfully."
172
- updated_df = display_bookmarks()
173
- return message, updated_df
174
  except Exception as e:
175
  return f"Error: {str(e)}", display_bookmarks()
176
 
@@ -185,8 +208,7 @@ def build_app():
185
  bookmark_table = gr.HTML(label="Bookmarks")
186
 
187
  def update_bookmark_table(file):
188
- message, df = process_uploaded_file(file)
189
- html_table = df.to_html(escape=False, index=False)
190
  return message, html_table
191
 
192
  process_button.click(
@@ -220,8 +242,7 @@ def build_app():
220
  delete_button = gr.Button("Delete Bookmark")
221
 
222
  def update_manage_table():
223
- df = display_bookmarks()
224
- html_table = df.to_html(escape=False, index=False)
225
  return html_table
226
 
227
  refresh_button.click(
 
8
  import faiss
9
  import numpy as np
10
  import pandas as pd
11
+ import asyncio
12
+ import aiohttp
13
 
14
  # Initialize models and variables
 
15
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
16
  faiss_index = None
17
  bookmarks = []
 
22
  extracted_bookmarks = []
23
  for link in soup.find_all('a'):
24
  url = link.get('href')
25
+ title = link.text.strip()
26
  if url and title:
27
  extracted_bookmarks.append({'url': url, 'title': title})
28
  return extracted_bookmarks
29
 
30
+ async def fetch_url_info(session, bookmark):
31
  url = bookmark['url']
32
  if url in fetch_cache:
33
  bookmark.update(fetch_cache[url])
34
  return bookmark
35
 
36
  try:
37
+ async with session.get(url, timeout=5) as response:
38
+ bookmark['etag'] = response.headers.get('ETag', 'N/A')
39
+ bookmark['status_code'] = response.status
40
+
41
+ if response.status >= 400:
42
+ bookmark['dead_link'] = True
43
+ bookmark['description'] = ''
44
+ else:
45
+ bookmark['dead_link'] = False
46
+ content = await response.text()
47
+ soup = BeautifulSoup(content, 'html.parser')
48
+
49
+ # Extract meta description or Open Graph description
50
+ meta_description = soup.find('meta', attrs={'name': 'description'})
51
+ og_description = soup.find('meta', attrs={'property': 'og:description'})
52
+ if og_description and og_description.get('content'):
53
+ description = og_description.get('content')
54
+ elif meta_description and meta_description.get('content'):
55
+ description = meta_description.get('content')
56
+ else:
57
+ description = ''
58
+
59
+ bookmark['description'] = description
60
  except Exception as e:
61
  bookmark['dead_link'] = True
62
  bookmark['etag'] = 'N/A'
63
  bookmark['status_code'] = 'N/A'
64
+ bookmark['description'] = ''
 
65
  finally:
66
  fetch_cache[url] = {
67
  'etag': bookmark.get('etag'),
68
  'status_code': bookmark.get('status_code'),
69
  'dead_link': bookmark.get('dead_link'),
70
+ 'description': bookmark.get('description'),
 
71
  }
72
  return bookmark
73
 
74
+ async def process_bookmarks_async(bookmarks):
75
+ async with aiohttp.ClientSession() as session:
76
+ tasks = []
77
+ for bookmark in bookmarks:
78
+ task = asyncio.ensure_future(fetch_url_info(session, bookmark))
79
+ tasks.append(task)
80
+ await asyncio.gather(*tasks)
81
+
82
  def generate_summary(bookmark):
83
+ description = bookmark.get('description', '')
84
+ if description:
85
+ bookmark['summary'] = description
 
 
 
86
  else:
87
+ title = bookmark.get('title', '')
88
+ if title:
89
+ bookmark['summary'] = title
90
+ else:
91
+ bookmark['summary'] = 'No summary available.'
92
  return bookmark
93
 
94
  def vectorize_and_index(bookmarks):
 
101
 
102
  def display_bookmarks():
103
  data = []
104
+ classes = []
105
  for i, bookmark in enumerate(bookmarks):
106
  status = "Dead Link" if bookmark.get('dead_link') else "Active"
107
+ row_class = "dead-link" if bookmark.get('dead_link') else ""
108
  data.append({
109
  'Index': i,
110
  'Title': bookmark['title'],
 
112
  'Status': status,
113
  'ETag': bookmark.get('etag', 'N/A'),
114
  'Summary': bookmark.get('summary', ''),
 
115
  })
116
+ classes.append(row_class)
117
  df = pd.DataFrame(data)
118
+ # Return HTML with styles
119
+ return df.to_html(escape=False, index=False)
120
 
121
  def process_uploaded_file(file):
122
  global bookmarks, faiss_index
123
  if file is None:
124
  return "Please upload a bookmarks HTML file.", pd.DataFrame()
125
  try:
 
126
  file_content = file.decode('utf-8')
127
  except UnicodeDecodeError:
128
  return "Error decoding the file. Please ensure it's a valid HTML file.", pd.DataFrame()
 
132
  if not bookmarks:
133
  return "No bookmarks found in the uploaded file.", pd.DataFrame()
134
 
135
+ # Asynchronously fetch bookmark info
136
+ asyncio.run(process_bookmarks_async(bookmarks))
137
+
138
+ # Generate summaries using descriptions
139
  for bookmark in bookmarks:
 
140
  generate_summary(bookmark)
141
 
142
  faiss_index, embeddings = vectorize_and_index(bookmarks)
143
  message = f"Successfully processed {len(bookmarks)} bookmarks."
144
+ bookmark_html = display_bookmarks()
145
+ return message, bookmark_html
146
 
147
  def chatbot_response(user_query):
148
  if faiss_index is None or not bookmarks:
 
168
  return "Invalid bookmark index.", display_bookmarks()
169
  bookmarks[bookmark_idx]['title'] = new_title
170
  bookmarks[bookmark_idx]['url'] = new_url
171
+ # Re-fetch bookmark info
172
+ asyncio.run(process_bookmarks_async([bookmarks[bookmark_idx]]))
173
  generate_summary(bookmarks[bookmark_idx])
174
  # Rebuild the FAISS index
175
  faiss_index, embeddings = vectorize_and_index(bookmarks)
176
  message = "Bookmark updated successfully."
177
+ updated_html = display_bookmarks()
178
+ return message, updated_html
179
  except Exception as e:
180
  return f"Error: {str(e)}", display_bookmarks()
181
 
 
192
  else:
193
  faiss_index = None
194
  message = "Bookmark deleted successfully."
195
+ updated_html = display_bookmarks()
196
+ return message, updated_html
197
  except Exception as e:
198
  return f"Error: {str(e)}", display_bookmarks()
199
 
 
208
  bookmark_table = gr.HTML(label="Bookmarks")
209
 
210
  def update_bookmark_table(file):
211
+ message, html_table = process_uploaded_file(file)
 
212
  return message, html_table
213
 
214
  process_button.click(
 
242
  delete_button = gr.Button("Delete Bookmark")
243
 
244
  def update_manage_table():
245
+ html_table = display_bookmarks()
 
246
  return html_table
247
 
248
  refresh_button.click(