mgbam committed
Commit c04089b · verified · 1 Parent(s): 2f92e9e

Rename services.py to web_scraper.py

Files changed (2):
  1. services.py +0 -111
  2. web_scraper.py +237 -0
services.py DELETED
@@ -1,111 +0,0 @@
- # /services.py
- """Manages interactions with all external LLM and search APIs."""
- 
- import os
- import logging
- from typing import Dict, Any, Generator, List
- 
- from dotenv import load_dotenv
- from huggingface_hub import InferenceClient
- from tavily import TavilyClient
- from groq import Groq
- import fireworks.client as Fireworks
- import openai
- import google.generativeai as genai
- 
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- load_dotenv()
- 
- # --- API Keys from .env ---
- HF_TOKEN = os.getenv("HF_TOKEN")
- TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
- FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY")
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
- DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
- 
- Messages = List[Dict[str, Any]]
- 
- class LLMService:
-     """A multi-provider wrapper for LLM inference APIs."""
- 
-     def __init__(self):
-         self.hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
-         self.groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
-         self.openai_client = openai.OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None
- 
-         if DEEPSEEK_API_KEY:
-             # DeepSeek exposes an OpenAI-compatible endpoint
-             self.deepseek_client = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/v1")
-         else:
-             self.deepseek_client = None
- 
-         if FIREWORKS_API_KEY:
-             Fireworks.api_key = FIREWORKS_API_KEY
-             self.fireworks_client = Fireworks
-         else:
-             self.fireworks_client = None
- 
-         if GEMINI_API_KEY:
-             genai.configure(api_key=GEMINI_API_KEY)
-             self.gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
-         else:
-             self.gemini_model = None
- 
-     def _prepare_messages_for_gemini(self, messages: Messages) -> List[Dict[str, Any]]:
-         gemini_messages = []
-         for msg in messages:
-             if msg['role'] == 'system':
-                 continue  # Gemini doesn't use a system role in this way
-             role = 'model' if msg['role'] == 'assistant' else 'user'
-             gemini_messages.append({'role': role, 'parts': [msg['content']]})
-         return gemini_messages
- 
-     def generate_code_stream(self, model_id: str, messages: Messages, max_tokens: int = 8192) -> Generator[str, None, None]:
-         provider, model_name = model_id.split('/', 1)
-         logging.info(f"Dispatching to provider: {provider} for model: {model_name}")
- 
-         try:
-             if provider in ['openai', 'groq', 'deepseek', 'fireworks']:
-                 client_map = {
-                     'openai': self.openai_client,
-                     'groq': self.groq_client,
-                     'deepseek': self.deepseek_client,
-                     'fireworks': self.fireworks_client.ChatCompletion if self.fireworks_client else None,
-                 }
-                 client = client_map.get(provider)
-                 if not client:
-                     raise ValueError(f"{provider.capitalize()} API key not configured.")
- 
-                 if provider == 'fireworks':
-                     stream = client.create(model=model_name, messages=messages, stream=True, max_tokens=max_tokens)
-                 else:
-                     stream = client.chat.completions.create(model=model_name, messages=messages, stream=True, max_tokens=max_tokens)
-                 for chunk in stream:
-                     if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
-                         yield chunk.choices[0].delta.content
- 
-             elif provider == 'gemini':
-                 if not self.gemini_model:
-                     raise ValueError("Gemini API key not configured.")
-                 system_prompt = next((msg['content'] for msg in messages if msg['role'] == 'system'), "")
-                 gemini_messages = self._prepare_messages_for_gemini(messages)
-                 # Prepend system prompt to first user message for Gemini
-                 if system_prompt and gemini_messages and gemini_messages[0]['role'] == 'user':
-                     gemini_messages[0]['parts'][0] = f"{system_prompt}\n\n{gemini_messages[0]['parts'][0]}"
-                 stream = self.gemini_model.generate_content(gemini_messages, stream=True)
-                 for chunk in stream:
-                     yield chunk.text
- 
-             elif provider == 'huggingface':
-                 if not self.hf_client:
-                     raise ValueError("Hugging Face API token not configured.")
-                 hf_model_id = model_id.split('/', 1)[1]
-                 stream = self.hf_client.chat_completion(model=hf_model_id, messages=messages, stream=True, max_tokens=max_tokens)
-                 for chunk in stream:
-                     if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
-                         yield chunk.choices[0].delta.content
-             else:
-                 raise ValueError(f"Unknown provider: {provider}")
-         except Exception as e:
-             logging.error(f"LLM API Error with provider {provider}: {e}")
-             yield f"Error from {provider.capitalize()}: {str(e)}"
- 
- class SearchService:
-     def __init__(self, api_key: str = TAVILY_API_KEY):
-         self.client = TavilyClient(api_key=api_key) if api_key else None
-         if not self.client:
-             logging.warning("TAVILY_API_KEY not set. Web search will be disabled.")
- 
-     def is_available(self) -> bool:
-         return self.client is not None
- 
-     def search(self, query: str, max_results: int = 5) -> str:
-         if not self.is_available():
-             return "Web search is not available."
-         try:
-             response = self.client.search(query, search_depth="advanced", max_results=min(max(1, max_results), 10))
-             return "Web Search Results:\n\n" + "\n---\n".join(
-                 f"Title: {res.get('title', 'N/A')}\nURL: {res.get('url', 'N/A')}\nContent: {res.get('content', 'N/A')}"
-                 for res in response.get('results', [])
-             )
-         except Exception as e:
-             return f"Search error: {str(e)}"
- 
- llm_service = LLMService()
- search_service = SearchService()
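
For reference, the removed module exposed two module-level singletons, llm_service and search_service. A minimal sketch of how callers consumed them (the model id and prompts below are hypothetical examples, not taken from this repo):

    from services import llm_service, search_service

    # Model ids use the "<provider>/<model_name>" format that
    # generate_code_stream() splits on (see split('/', 1) above);
    # "groq/llama3-70b-8192" is an illustrative placeholder.
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Write a hello-world Flask app."},
    ]
    for token in llm_service.generate_code_stream("groq/llama3-70b-8192", messages):
        print(token, end="", flush=True)

    # Search degrades gracefully when TAVILY_API_KEY is unset.
    if search_service.is_available():
        print(search_service.search("Flask quickstart", max_results=3))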
web_scraper.py ADDED
@@ -0,0 +1,237 @@
+ # NOTE: imports inferred from usage in this module; the rendered hunk
+ # begins at the first function definition, so this preamble is assumed.
+ import os
+ import re
+ 
+ import cv2
+ import docx
+ import PyPDF2
+ import pytesseract
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ 
+ def extract_text_from_image(image_path):
+     """Extract text from an image using Tesseract OCR."""
+     try:
+         # Fail fast with a helpful message if the Tesseract binary is missing
+         try:
+             pytesseract.get_tesseract_version()
+         except Exception:
+             return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."
+         image = cv2.imread(image_path)
+         if image is None:
+             return "Error: Could not read image file"
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         gray = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
+         # Otsu thresholding produces a clean binary image for OCR
+         _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+         text = pytesseract.image_to_string(binary, config='--psm 6')
+         return text.strip() if text.strip() else "No text found in image"
+     except Exception as e:
+         return f"Error extracting text from image: {e}"
+
+ def extract_text_from_file(file_path):
+     """Extract plain text from a PDF, text/Markdown/CSV, DOCX, or image file."""
+     if not file_path:
+         return ""
+     ext = os.path.splitext(file_path)[1].lower()
+     try:
+         if ext == ".pdf":
+             with open(file_path, "rb") as f:
+                 reader = PyPDF2.PdfReader(f)
+                 return "\n".join(page.extract_text() or "" for page in reader.pages)
+         elif ext in [".txt", ".md", ".csv"]:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 return f.read()
+         elif ext == ".docx":
+             doc = docx.Document(file_path)
+             return "\n".join(para.text for para in doc.paragraphs)
+         elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"]:
+             return extract_text_from_image(file_path)
+         else:
+             return ""
+     except Exception as e:
+         return f"Error extracting text: {e}"
+
+ def extract_website_content(url: str) -> str:
+     """Extract HTML code and content from a website URL."""
+     try:
+         # Normalize the URL: default to https:// when no scheme is given
+         parsed_url = urlparse(url)
+         if not parsed_url.scheme:
+             url = "https://" + url
+             parsed_url = urlparse(url)
+         if not parsed_url.netloc:
+             return "Error: Invalid URL provided"
+ 
+         # Browser-like headers reduce the chance of being blocked
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.9',
+             'Accept-Encoding': 'gzip, deflate, br',
+             'DNT': '1',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1',
+             'Sec-Fetch-Dest': 'document',
+             'Sec-Fetch-Mode': 'navigate',
+             'Sec-Fetch-Site': 'none',
+             'Sec-Fetch-User': '?1',
+             'Cache-Control': 'max-age=0',
+         }
+         session = requests.Session()
+         session.headers.update(headers)
+ 
+         # On a 403, retry with an alternate User-Agent
+         max_retries = 3
+         for attempt in range(max_retries):
+             try:
+                 response = session.get(url, timeout=15, allow_redirects=True)
+                 response.raise_for_status()
+                 break
+             except requests.exceptions.HTTPError as e:
+                 if e.response.status_code == 403 and attempt < max_retries - 1:
+                     session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+                     continue
+                 raise
+ 
+         # Decode the body; fall back to latin-1 if the result doesn't look like HTML
+         try:
+             response.encoding = response.apparent_encoding
+             raw_html = response.text
+         except Exception:
+             raw_html = response.content.decode('utf-8', errors='ignore')
+         if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
+             raw_html = response.content.decode('latin-1', errors='ignore')
+ 
+         soup = BeautifulSoup(raw_html, 'html.parser')
+ 
+         title = soup.find('title')
+         title_text = title.get_text().strip() if title else "No title found"
+         meta_desc = soup.find('meta', attrs={'name': 'description'})
+         description = meta_desc.get('content', '') if meta_desc else ""
+ 
+         # Collect substantial text from likely main-content containers
+         content_sections = []
+         main_selectors = ['main', 'article', '.content', '.main-content', '.post-content', '#content', '#main', '.entry-content', '.post-body']
+         for selector in main_selectors:
+             for element in soup.select(selector):
+                 text = element.get_text().strip()
+                 if len(text) > 100:
+                     content_sections.append(text)
+ 
+         # Collect navigation links
+         nav_links = []
+         for nav in soup.find_all(['nav', 'header']):
+             for link in nav.find_all('a'):
+                 link_text = link.get_text().strip()
+                 link_href = link.get('href', '')
+                 if link_text and link_href:
+                     nav_links.append(f"{link_text}: {link_href}")
+ 
+         def absolutize(candidate: str) -> str:
+             """Resolve protocol-relative and relative URLs against the page URL."""
+             if candidate.startswith('//'):
+                 return 'https:' + candidate
+             if not candidate.startswith(('http://', 'https://')):
+                 return urljoin(url, candidate)
+             return candidate
+ 
+         # Rewrite <img> src (or lazy-load data-src) to absolute URLs
+         for img in soup.find_all('img'):
+             src = img.get('src', '')
+             if src:
+                 img['src'] = absolutize(src)
+             else:
+                 data_src = img.get('data-src', '')
+                 if data_src:
+                     img['src'] = absolutize(data_src)
+ 
+         # Rewrite background-image URLs in inline style attributes
+         bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
+         for element in soup.find_all(attrs={'style': True}):
+             style_attr = element.get('style', '')
+             for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
+                 style_attr = style_attr.replace(match, absolutize(match))
+             element['style'] = style_attr
+ 
+         # Rewrite background-image URLs inside <style> blocks
+         for style in soup.find_all('style'):
+             if style.string:
+                 style_content = style.string
+                 for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
+                     style_content = style_content.replace(match, absolutize(match))
+                 style.string = style_content
+ 
+         # Gather image URLs and keep only those that answer a HEAD request
+         images = [{'src': img.get('src', ''), 'alt': img.get('alt', '')}
+                   for img in soup.find_all('img') if img.get('src', '')]
+ 
+         def test_image_url(img_url):
+             try:
+                 test_response = requests.head(img_url, timeout=5, allow_redirects=True)
+                 return test_response.status_code == 200
+             except Exception:
+                 return False
+ 
+         working_images = [img for img in images[:10] if test_image_url(img['src'])]
+ 
+         # Strip comments, collapse whitespace, and cap the HTML length
+         modified_html = str(soup)
+         cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
+         cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
+         cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
+         if len(cleaned_html) > 15000:
+             cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
+ 
+         if not title_text or title_text == "No title found":
+             title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"
+ 
+         if len(cleaned_html.strip()) < 100:
+             website_content = f"""
+ WEBSITE REDESIGN - EXTRACTION FAILED
+ ====================================
+ URL: {url}
+ Title: {title_text}
+ ERROR: Could not extract meaningful HTML content from this website. This could be due to:
+ 1. The website uses heavy JavaScript to load content dynamically
+ 2. The website has anti-bot protection
+ 3. The website requires authentication
+ 4. The website is using advanced compression or encoding
+ FALLBACK APPROACH:
+ Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
+ 1. Create a typical layout for this type of website
+ 2. Use placeholder content that would be appropriate
+ 3. Include modern design elements and responsive features
+ 4. Use a clean, professional design with good typography
+ 5. Make it mobile-friendly and accessible
+ This will help me create a better design for you."""
+             return website_content.strip()
+ 
+         website_content = f"""
+ WEBSITE REDESIGN - ORIGINAL HTML CODE
+ ===[TRUNCATED FOR BREVITY]==="""
+         return website_content.strip()
+ 
+     except requests.exceptions.HTTPError as e:
+         if e.response.status_code == 403:
+             return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
+         elif e.response.status_code == 404:
+             return "Error: Website not found (404). Please check the URL and try again."
+         elif e.response.status_code >= 500:
+             return f"Error: Website server error ({e.response.status_code}). Please try again later."
+         else:
+             return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
+     except requests.exceptions.Timeout:
+         return "Error: Request timed out. The website may be slow or unavailable."
+     except requests.exceptions.ConnectionError:
+         return "Error: Could not connect to the website. Please check your internet connection and the URL."
+     except requests.exceptions.RequestException as e:
+         return f"Error accessing website: {str(e)}"
+     except Exception as e:
+         return f"Error extracting website content: {str(e)}"
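
A minimal usage sketch for the new module (the file names and URL are placeholders, and it assumes the inferred imports above plus the third-party packages they name are installed):

    from web_scraper import extract_text_from_file, extract_website_content

    print(extract_text_from_file("notes.pdf"))     # routed by extension: PDF -> PyPDF2
    print(extract_text_from_file("scan.png"))      # images go through Tesseract OCR
    print(extract_website_content("example.com"))  # scheme defaults to https://

All three functions return error strings rather than raising, so callers can surface the result directly in a UI.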