Crustdata_Build_Challenge

Sleeping

App Files Files Community

Crustdata_Build_Challenge / app.py

wifix199

Update app.py

036b96f verified 2 months ago

raw

history blame contribute delete

6.38 kB

	import gradio as gr
	from huggingface_hub import InferenceClient
	from langchain.vectorstores import FAISS
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.document_loaders import TextLoader

	# Remote chat model served through the Hugging Face Inference API.
	# Any other supported model id can be substituted here.
	client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")

	# Sample knowledge base for Crustdata APIs (Markdown text that is later
	# chunked and embedded for retrieval).
	#
	# This is a raw string on purpose: the multi-line curl examples end their
	# lines with a backslash. In a regular triple-quoted string a backslash
	# followed by a newline is a line continuation, so both characters would be
	# silently dropped and the examples would collapse onto a single line.
	docs = r"""
	# Crustdata Dataset API

	## Description
	The Crustdata Dataset API provides access to a wide variety of datasets across different domains. It allows users to search, filter, and retrieve datasets based on categories, tags, and other metadata.

	## Key Endpoints

	### 1. GET /datasets
	- Description: Retrieves a list of available datasets.
	- Parameters:
	- `category` (optional): Filter datasets by a specific category.
	- `tags` (optional): Filter datasets by tags (comma-separated).
	- `limit` (optional): Maximum number of datasets to return (default: 10).

	- Example Request:
	```bash
	curl -X GET "https://api.crustdata.com/datasets?category=finance&tags=economy,stocks&limit=5"
	```

	- Example Response:
	```json
	{
	"datasets": [
	{
	"id": "12345",
	"name": "Global Finance Dataset",
	"category": "finance",
	"tags": ["economy", "stocks"]
	},
	...
	]
	}
	```

	### 2. GET /datasets/{id}
	- Description: Retrieves detailed information about a specific dataset.
	- Parameters:
	- `id` (required): The unique identifier of the dataset.

	- Example Request:
	```bash
	curl -X GET "https://api.crustdata.com/datasets/12345"
	```

	- Example Response:
	```json
	{
	"id": "12345",
	"name": "Global Finance Dataset",
	"description": "A comprehensive dataset on global financial markets.",
	"category": "finance",
	"tags": ["economy", "stocks"],
	"source": "World Bank"
	}
	```

	---

	# Crustdata Discovery and Enrichment API

	## Description
	The Crustdata Discovery and Enrichment API allows users to enrich their datasets by adding metadata, geolocation information, and other relevant attributes.

	## Key Endpoints

	### 1. POST /enrich
	- Description: Enriches input data with additional metadata based on the specified enrichment type.
	- Parameters:
	- `input_data` (required): A list of data entries to be enriched.
	- `enrichment_type` (required): The type of enrichment to apply. Supported types:
	- `geolocation`
	- `demographics`

	- Example Request:
	```bash
	curl -X POST "https://api.crustdata.com/enrich" \
	-H "Content-Type: application/json" \
	-d '{
	"input_data": [{"address": "123 Main St, Springfield"}],
	"enrichment_type": "geolocation"
	}'
	```

	- Example Response:
	```json
	{
	"enriched_data": [
	{
	"address": "123 Main St, Springfield",
	"latitude": 37.12345,
	"longitude": -93.12345
	}
	]
	}
	```

	### 2. POST /search
	- Description: Searches for relevant metadata or datasets based on user-provided criteria.
	- Parameters:
	- `query` (required): The search term or query string.
	- `filters` (optional): Additional filters to narrow down the search results.

	- Example Request:
	```bash
	curl -X POST "https://api.crustdata.com/search" \
	-H "Content-Type: application/json" \
	-d '{
	"query": "energy consumption",
	"filters": {"category": "energy"}
	}'
	```

	- Example Response:
	```json
	{
	"results": [
	{
	"id": "67890",
	"name": "Energy Consumption Dataset",
	"category": "energy",
	"tags": ["consumption", "renewables"]
	}
	]
	}
	```

	---

	# General Notes
	- All endpoints require authentication using an API key.
	- API requests must include the `Authorization` header:
	```plaintext
	Authorization: Bearer YOUR_API_KEY
	```
	- Response format: JSON
	- Base URL: `https://api.crustdata.com`
	"""

	# Sentence-transformer model used to embed every documentation chunk.
	embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
	embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

	# Cut the documentation into 500-character chunks that overlap by 50
	# characters, then index the chunks in an in-memory FAISS store.
	text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
	doc_chunks = text_splitter.create_documents(texts=[docs])
	docsearch = FAISS.from_documents(documents=doc_chunks, embedding=embeddings)


	def retrieve_context(query, k=2):
	    """Retrieve the most relevant context from the knowledge base.

	    Args:
	        query: Text to look up in the module-level ``docsearch`` vector store.
	        k: Number of most-similar chunks to fetch. Defaults to 2, the value
	            that was previously hard-coded.

	    Returns:
	        The ``page_content`` of the matching chunks joined with newlines, in
	        the order returned by the similarity search.
	    """
	    matches = docsearch.similarity_search(query, k=k)
	    return "\n".join(match.page_content for match in matches)


	def respond(
	    message,
	    history: list[tuple[str, str]],
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	):
	    """Generate a streamed response using the Hugging Face Inference API.

	    Args:
	        message: The user's latest message.
	        history: Earlier (user, assistant) turns; empty entries are skipped.
	        system_message: Instruction text sent as the system role and also
	            embedded at the top of the final prompt.
	        max_tokens: Forwarded to ``client.chat_completion`` as ``max_tokens``.
	        temperature: Forwarded as the sampling temperature.
	        top_p: Forwarded as the nucleus-sampling threshold.

	    Yields:
	        The response text accumulated so far, once per usable streamed chunk.
	    """
	    # Retrieve relevant context from the knowledge base and fold it into the
	    # final user turn.
	    context = retrieve_context(message)
	    prompt = f"{system_message}\n\nContext:\n{context}\n\nUser: {message}\nAssistant:"

	    messages = [{"role": "system", "content": system_message}]
	    for user_turn, assistant_turn in history:
	        if user_turn:
	            messages.append({"role": "user", "content": user_turn})
	        if assistant_turn:
	            messages.append({"role": "assistant", "content": assistant_turn})

	    messages.append({"role": "user", "content": prompt})

	    response = ""

	    # The loop variable is named `chunk` so it does not shadow the `message`
	    # parameter.
	    for chunk in client.chat_completion(
	        messages,
	        max_tokens=max_tokens,
	        stream=True,
	        temperature=temperature,
	        top_p=top_p,
	    ):
	        # A chunk with no choices carries no text; skip it instead of
	        # failing on the [0] index.
	        if not chunk.choices:
	            continue
	        token = chunk.choices[0].delta.content
	        # The delta content may be None (e.g. on a role-only or final chunk);
	        # concatenating None to a str would raise TypeError.
	        if token:
	            response += token
	        yield response


	# Chat UI. The additional inputs are listed in the same order as the
	# trailing parameters of `respond`: system_message, max_tokens,
	# temperature, top_p.
	demo = gr.ChatInterface(
	    fn=respond,
	    title="Crustdata API Chatbot",
	    description="Ask any technical questions about Crustdata’s Dataset and Discovery APIs.",
	    additional_inputs=[
	        gr.Textbox(label="System message", value="You are a technical assistant for Crustdata APIs."),
	        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
	        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
	        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
	    ],
	)

	# Start the app (with a public share link) only when run as a script.
	if __name__ == "__main__":
	    demo.launch(share=True)