Update app.py
app.py CHANGED

@@ -16,17 +16,21 @@ from huggingface_hub import InferenceClient
 import inspect
 import logging
 
+
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # Environment variables and configurations
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
+ACCOUNT_ID = os.environ.get("CLOUDFARE_ACCOUNT_ID")
+AUTH_TOKEN = os.environ.get("CLOUDFLARE_AUTH_TOKEN")
 
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.3",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "@cf/meta/llama-3.1-8b-instruct"
 ]
 
 # Initialize LlamaParse
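For reference, here is a minimal startup-check sketch (not part of the commit) showing how the configuration values introduced above could be validated before the app serves requests. The variable names are taken from the diff; note that the account-ID variable is spelled "CLOUDFARE_ACCOUNT_ID" in the committed code, so the sketch reproduces that spelling.

import os

# Configuration lookups copied from the diff above; the check itself is hypothetical.
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
ACCOUNT_ID = os.environ.get("CLOUDFARE_ACCOUNT_ID")    # spelling as committed
AUTH_TOKEN = os.environ.get("CLOUDFLARE_AUTH_TOKEN")

missing = [name for name, value in [
    ("HUGGINGFACE_TOKEN", huggingface_token),
    ("LLAMA_CLOUD_API_KEY", llama_cloud_api_key),
    ("CLOUDFARE_ACCOUNT_ID", ACCOUNT_ID),
    ("CLOUDFLARE_AUTH_TOKEN", AUTH_TOKEN),
] if not value]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
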
@@ -81,33 +85,81 @@ def update_vectors(files, parser):
 
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}."
 
+import os
+import json
+import requests
+from huggingface_hub import InferenceClient
+
+ACCOUNT_ID = "your-account-id"  # Replace with your actual Cloudflare account ID
+AUTH_TOKEN = os.environ.get("CLOUDFLARE_AUTH_TOKEN")
+
 def generate_chunked_response(prompt, model, max_tokens=1000, num_calls=3, temperature=0.2, should_stop=False):
     print(f"Starting generate_chunked_response with {num_calls} calls")
-    client = InferenceClient(model, token=huggingface_token)
     full_response = ""
     messages = [{"role": "user", "content": prompt}]
 
-    [original response-generation loop (old lines 90-110) removed; its content was not captured in this view]
+    if model == "@cf/meta/llama-3.1-8b-instruct":
+        # Cloudflare API
+        for i in range(num_calls):
+            print(f"Starting Cloudflare API call {i+1}")
+            if should_stop:
+                print("Stop clicked, breaking loop")
+                break
+            try:
+                response = requests.post(
+                    f"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@cf/meta/llama-3.1-8b-instruct",
+                    headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
+                    json={
+                        "stream": True,
+                        "messages": [
+                            {"role": "system", "content": "You are a friendly assistant"},
+                            {"role": "user", "content": prompt}
+                        ],
+                        "max_tokens": max_tokens,
+                        "temperature": temperature
+                    },
+                    stream=True
+                )
+
+                for line in response.iter_lines():
+                    if should_stop:
+                        print("Stop clicked during streaming, breaking")
+                        break
+                    if line:
+                        try:
+                            json_data = json.loads(line.decode('utf-8').split('data: ')[1])
+                            chunk = json_data['response']
+                            full_response += chunk
+                        except json.JSONDecodeError:
+                            continue
+                print(f"Cloudflare API call {i+1} completed")
+            except Exception as e:
+                print(f"Error in generating response from Cloudflare: {str(e)}")
+    else:
+        # Original Hugging Face API logic
+        client = InferenceClient(model, token=huggingface_token)
+
+        for i in range(num_calls):
+            print(f"Starting Hugging Face API call {i+1}")
+            if should_stop:
+                print("Stop clicked, breaking loop")
+                break
+            try:
+                for message in client.chat_completion(
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    stream=True,
+                ):
+                    if should_stop:
+                        print("Stop clicked during streaming, breaking")
+                        break
+                    if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                        chunk = message.choices[0].delta.content
+                        full_response += chunk
+                print(f"Hugging Face API call {i+1} completed")
+            except Exception as e:
+                print(f"Error in generating response from Hugging Face: {str(e)}")
 
     # Clean up the response
     clean_response = re.sub(r'<s>\[INST\].*?\[/INST\]\s*', '', full_response, flags=re.DOTALL)
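The Cloudflare branch reads a server-sent-events style stream in which each data line carries a small JSON object with a "response" field, which is why the committed code splits on 'data: ' before calling json.loads. A standalone sketch of that parsing, with an illustrative payload (the sample line is hypothetical, not taken from the commit):

import json

def extract_chunk(raw_line: bytes) -> str:
    # Parse one streamed line of the form b'data: {"response": "..."}'.
    decoded = raw_line.decode("utf-8")
    if "data: " not in decoded:
        return ""          # ignore keep-alives and non-data lines
    try:
        payload = json.loads(decoded.split("data: ")[1])
        return payload.get("response", "")
    except json.JSONDecodeError:
        return ""          # e.g. the terminating "data: [DONE]" marker

print(extract_chunk(b'data: {"response": "Hello"}'))   # prints: Hello
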
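Finally, a hedged usage sketch of the updated function; it assumes generate_chunked_response ultimately returns the cleaned response text, since the return statement falls outside the hunk shown here.

# Hypothetical call; model can be any entry from MODELS, including the new Cloudflare one.
answer = generate_chunked_response(
    prompt="Summarize the uploaded documents in three bullet points.",
    model="@cf/meta/llama-3.1-8b-instruct",
    max_tokens=500,
    num_calls=1,
    temperature=0.2,
)
print(answer)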