Shreyas094 committed on
Commit
8f4e927
·
verified ·
1 Parent(s): 664e897

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -100
app.py CHANGED
@@ -1,10 +1,7 @@
1
- import random
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
  import torch
 
6
  from huggingface_hub import login
7
- import os
8
 
9
  # Directly assign your Hugging Face token here
10
  hf_token = "your_hugging_face_api_token"
@@ -12,94 +9,6 @@ hf_token = "your_hugging_face_api_token"
12
  # Log in to Hugging Face
13
  login(token=hf_token)
14
 
15
- # List of user agents
16
- _useragent_list = [
17
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
18
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
19
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
20
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
21
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
22
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
23
- ]
24
-
25
- # Function to extract visible text from HTML content of a webpage
26
- def extract_text_from_webpage(html):
27
- print("Extracting text from webpage...")
28
- soup = BeautifulSoup(html, 'html.parser')
29
- for script in soup(["script", "style"]):
30
- script.extract() # Remove scripts and styles
31
- text = soup.get_text()
32
- lines = (line.strip() for line in text.splitlines())
33
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
34
- text = '\n'.join(chunk for chunk in chunks if chunk)
35
- print(f"Extracted text length: {len(text)}")
36
- return text
37
-
38
- # Function to perform a Google search and retrieve results
39
- def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
40
- """Performs a Google search and returns the results."""
41
- print(f"Searching for term: {term}")
42
- escaped_term = requests.utils.quote(term)
43
- start = 0
44
- all_results = []
45
- max_chars_per_page = 8000 # Limit the number of characters from each webpage to stay under the token limit
46
-
47
- with requests.Session() as session:
48
- while start < num_results:
49
- print(f"Fetching search results starting from: {start}")
50
- try:
51
- # Choose a random user agent
52
- user_agent = random.choice(_useragent_list)
53
- headers = {
54
- 'User-Agent': user_agent
55
- }
56
- print(f"Using User-Agent: {headers['User-Agent']}")
57
-
58
- resp = session.get(
59
- url="https://www.google.com/search",
60
- headers=headers,
61
- params={
62
- "q": term,
63
- "num": num_results - start,
64
- "hl": lang,
65
- "start": start,
66
- "safe": safe,
67
- },
68
- timeout=timeout,
69
- verify=ssl_verify,
70
- )
71
- resp.raise_for_status()
72
- except requests.exceptions.RequestException as e:
73
- print(f"Error fetching search results: {e}")
74
- break
75
-
76
- soup = BeautifulSoup(resp.text, "html.parser")
77
- result_block = soup.find_all("div", attrs={"class": "g"})
78
- if not result_block:
79
- print("No more results found.")
80
- break
81
- for result in result_block:
82
- link = result.find("a", href=True)
83
- if link:
84
- link = link["href"]
85
- print(f"Found link: {link}")
86
- try:
87
- webpage = session.get(link, headers=headers, timeout=timeout)
88
- webpage.raise_for_status()
89
- visible_text = extract_text_from_webpage(webpage.text)
90
- if len(visible_text) > max_chars_per_page:
91
- visible_text = visible_text[:max_chars_per_page] + "..."
92
- all_results.append({"link": link, "text": visible_text})
93
- except requests.exceptions.RequestException as e:
94
- print(f"Error fetching or processing {link}: {e}")
95
- all_results.append({"link": link, "text": None})
96
- else:
97
- print("No link found in result.")
98
- all_results.append({"link": None, "text": None})
99
- start += len(result_block)
100
- print(f"Total results fetched: {len(all_results)}")
101
- return all_results
102
-
103
  # Load the Mistral-7B-Instruct-v0.3 model and tokenizer with authorization header
104
  model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
105
  headers = {"Authorization": f"Bearer {hf_token}"}
@@ -117,15 +26,11 @@ model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token
117
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
118
  model.to(device)
119
 
120
- # Example usage
121
- search_term = "How did Tesla perform in Q1 2024"
122
- search_results = google_search(search_term, num_results=3)
123
-
124
- # Combine text from search results to create a prompt
125
- combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])
126
 
127
  # Tokenize the input text
128
- inputs = tokenizer(combined_text, return_tensors="pt").to(device)
129
 
130
  # Generate a response
131
  outputs = model.generate(**inputs, max_length=150, temperature=0.7, top_p=0.9, top_k=50)
 
1
+ import os
 
 
 
2
  import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from huggingface_hub import login
 
5
 
6
  # Directly assign your Hugging Face token here
7
  hf_token = "your_hugging_face_api_token"
 
9
  # Log in to Hugging Face
10
  login(token=hf_token)
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # Load the Mistral-7B-Instruct-v0.3 model and tokenizer with authorization header
13
  model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
14
  headers = {"Authorization": f"Bearer {hf_token}"}
 
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
  model.to(device)
28
 
29
+ # Example text input
30
+ text_input = "How did Tesla perform in Q1 2024?"
 
 
 
 
31
 
32
  # Tokenize the input text
33
+ inputs = tokenizer(text_input, return_tensors="pt").to(device)
34
 
35
  # Generate a response
36
  outputs = model.generate(**inputs, max_length=150, temperature=0.7, top_p=0.9, top_k=50)