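# NOTE: page-capture residue, not part of the program. This file appears to have been
# copied from a Hugging Face Space page whose status banner read "Spaces: Sleeping".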
import gradio as gr
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Load the Hugging Face causal language model that generates the answers.
# NOTE: gpt-neox-20b is a very large checkpoint; choose a lighter model if
# memory is limited.
model_name = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Bound only the generated continuation with max_new_tokens. max_length would
# also count the prompt tokens, and the retrieval prompt (question plus the
# retrieved documentation chunks) can use up a 512-token budget by itself,
# leaving no room for an answer.
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)
# Knowledge base for Crustdata APIs.
# The text below is split, embedded and indexed verbatim, so it must contain
# only documentation. Backslashes are written as "\\" so that the multi-line
# curl examples keep their shell line continuations instead of being joined
# by Python's backslash-newline handling inside the string literal.
docs = """
# Crustdata Dataset API
## Description
The Crustdata Dataset API provides access to a wide variety of datasets across different domains. It allows users to search, filter, and retrieve datasets based on categories, tags, and other metadata.
## Key Endpoints
### 1. **GET /datasets**
- **Description**: Retrieves a list of available datasets.
- **Parameters**:
- `category` (optional): Filter datasets by a specific category.
- `tags` (optional): Filter datasets by tags (comma-separated).
- `limit` (optional): Maximum number of datasets to return (default: 10).
- **Example Request**:
```bash
curl -X GET "https://api.crustdata.com/datasets?category=finance&tags=economy,stocks&limit=5"
```
- **Example Response**:
```json
{
"datasets": [
{
"id": "12345",
"name": "Global Finance Dataset",
"category": "finance",
"tags": ["economy", "stocks"]
},
...
]
}
```
### 2. **GET /datasets/{id}**
- **Description**: Retrieves detailed information about a specific dataset.
- **Parameters**:
- `id` (required): The unique identifier of the dataset.
- **Example Request**:
```bash
curl -X GET "https://api.crustdata.com/datasets/12345"
```
- **Example Response**:
```json
{
"id": "12345",
"name": "Global Finance Dataset",
"description": "A comprehensive dataset on global financial markets.",
"category": "finance",
"tags": ["economy", "stocks"],
"source": "World Bank"
}
```
---
# Crustdata Discovery and Enrichment API
## Description
The Crustdata Discovery and Enrichment API allows users to enrich their datasets by adding metadata, geolocation information, and other relevant attributes.
## Key Endpoints
### 1. **POST /enrich**
- **Description**: Enriches input data with additional metadata based on the specified enrichment type.
- **Parameters**:
- `input_data` (required): A list of data entries to be enriched.
- `enrichment_type` (required): The type of enrichment to apply. Supported types:
- `geolocation`
- `demographics`
- **Example Request**:
```bash
curl -X POST "https://api.crustdata.com/enrich" \\
-H "Content-Type: application/json" \\
-d '{
"input_data": [{"address": "123 Main St, Springfield"}],
"enrichment_type": "geolocation"
}'
```
- **Example Response**:
```json
{
"enriched_data": [
{
"address": "123 Main St, Springfield",
"latitude": 37.12345,
"longitude": -93.12345
}
]
}
```
### 2. **POST /search**
- **Description**: Searches for relevant metadata or datasets based on user-provided criteria.
- **Parameters**:
- `query` (required): The search term or query string.
- `filters` (optional): Additional filters to narrow down the search results.
- **Example Request**:
```bash
curl -X POST "https://api.crustdata.com/search" \\
-H "Content-Type: application/json" \\
-d '{
"query": "energy consumption",
"filters": {"category": "energy"}
}'
```
- **Example Response**:
```json
{
"results": [
{
"id": "67890",
"name": "Energy Consumption Dataset",
"category": "energy",
"tags": ["consumption", "renewables"]
}
]
}
```
---
# General Notes
- All endpoints require authentication using an API key.
- API requests must include the `Authorization` header:
```plaintext
Authorization: Bearer YOUR_API_KEY
```
- Response format: JSON
- Base URL: `https://api.crustdata.com`
"""
# Cut the documentation into overlapping chunks so each one can be embedded
# and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
doc_chunks = text_splitter.create_documents([docs])

# Embed every chunk with a sentence-transformers model and index the vectors
# in an in-memory FAISS store.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
docsearch = FAISS.from_documents(doc_chunks, embeddings)
# Create the retrieval-augmented QA chain.
# RetrievalQA expects a LangChain LLM object, not a raw transformers pipeline,
# so the pipeline is wrapped in HuggingFacePipeline before being handed over.
qa_chain = RetrievalQA.from_chain_type(
    llm=HuggingFacePipeline(pipeline=qa_pipeline),
    retriever=docsearch.as_retriever(),
    # With source documents enabled the chain output has two keys:
    # "result" (the answer text) and "source_documents".
    return_source_documents=True,
)
def answer_question(question):
    """Answer a user question about the Crustdata APIs.

    Args:
        question: The question text typed by the user.

    Returns:
        The generated answer text, or a short prompt asking for input when
        the question is empty or only whitespace.
    """
    # Skip the expensive retrieval + generation round trip for blank input.
    if not question or not question.strip():
        return "Please enter a question about the Crustdata APIs."

    # The chain is called with a dict instead of .run(): .run() only supports
    # chains with exactly one output key, and this chain also returns
    # "source_documents" alongside "result".
    response = qa_chain({"query": question})
    return response["result"]
# Build the Gradio UI: one two-line textbox in, plain text out, with
# answer_question doing the work.
question_box = gr.Textbox(
    lines=2,
    placeholder="Ask a question about Crustdata APIs...",
)
chat_interface = gr.Interface(
    fn=answer_question,
    inputs=question_box,
    outputs="text",
    title="Crustdata API Chat",
    description="Ask any technical questions about Crustdata’s Dataset and Discovery APIs.",
)

# Start the app; share=True additionally requests a public Gradio share link.
chat_interface.launch(share=True)