import gradio as gr
from huggingface_hub import InferenceClient
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE(review): TextLoader is unused; kept so the import surface is unchanged.
# These `langchain.*` paths are deprecated upstream in favor of
# `langchain_community.*` — migrate when that package is available.
from langchain.document_loaders import TextLoader

# Hugging Face Inference client backed by an open-source chat model.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")  # You can use any supported model

# Static knowledge base for the Crustdata APIs; chunked and embedded below
# for retrieval-augmented answers.
docs = """
# Crustdata Dataset API

## Description
The Crustdata Dataset API provides access to a wide variety of datasets across different domains. It allows users to search, filter, and retrieve datasets based on categories, tags, and other metadata.

## Key Endpoints

### 1. **GET /datasets**
- **Description**: Retrieves a list of available datasets.
- **Parameters**:
  - `category` (optional): Filter datasets by a specific category.
  - `tags` (optional): Filter datasets by tags (comma-separated).
  - `limit` (optional): Maximum number of datasets to return (default: 10).
- **Example Request**:
  ```bash
  curl -X GET "https://api.crustdata.com/datasets?category=finance&tags=economy,stocks&limit=5"
  ```
- **Example Response**:
  ```json
  {
    "datasets": [
      {
        "id": "12345",
        "name": "Global Finance Dataset",
        "category": "finance",
        "tags": ["economy", "stocks"]
      },
      ...
    ]
  }
  ```

### 2. **GET /datasets/{id}**
- **Description**: Retrieves detailed information about a specific dataset.
- **Parameters**:
  - `id` (required): The unique identifier of the dataset.
- **Example Request**:
  ```bash
  curl -X GET "https://api.crustdata.com/datasets/12345"
  ```
- **Example Response**:
  ```json
  {
    "id": "12345",
    "name": "Global Finance Dataset",
    "description": "A comprehensive dataset on global financial markets.",
    "category": "finance",
    "tags": ["economy", "stocks"],
    "source": "World Bank"
  }
  ```

---

# Crustdata Discovery and Enrichment API

## Description
The Crustdata Discovery and Enrichment API allows users to enrich their datasets by adding metadata, geolocation information, and other relevant attributes.

## Key Endpoints

### 1. **POST /enrich**
- **Description**: Enriches input data with additional metadata based on the specified enrichment type.
- **Parameters**:
  - `input_data` (required): A list of data entries to be enriched.
  - `enrichment_type` (required): The type of enrichment to apply. Supported types:
    - `geolocation`
    - `demographics`
- **Example Request**:
  ```bash
  curl -X POST "https://api.crustdata.com/enrich" \\
    -H "Content-Type: application/json" \\
    -d '{
      "input_data": [{"address": "123 Main St, Springfield"}],
      "enrichment_type": "geolocation"
    }'
  ```
- **Example Response**:
  ```json
  {
    "enriched_data": [
      {
        "address": "123 Main St, Springfield",
        "latitude": 37.12345,
        "longitude": -93.12345
      }
    ]
  }
  ```

### 2. **POST /search**
- **Description**: Searches for relevant metadata or datasets based on user-provided criteria.
- **Parameters**:
  - `query` (required): The search term or query string.
  - `filters` (optional): Additional filters to narrow down the search results.
- **Example Request**:
  ```bash
  curl -X POST "https://api.crustdata.com/search" \\
    -H "Content-Type: application/json" \\
    -d '{
      "query": "energy consumption",
      "filters": {"category": "energy"}
    }'
  ```
- **Example Response**:
  ```json
  {
    "results": [
      {
        "id": "67890",
        "name": "Energy Consumption Dataset",
        "category": "energy",
        "tags": ["consumption", "renewables"]
      }
    ]
  }
  ```

---

# General Notes
- All endpoints require authentication using an API key.
- API requests must include the `Authorization` header:
  ```plaintext
  Authorization: Bearer YOUR_API_KEY
  ```
- Response format: JSON
- Base URL: `https://api.crustdata.com`
"""

# Split the documentation into overlapping chunks so retrieval can return
# focused passages rather than the whole document.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
doc_chunks = text_splitter.create_documents([docs])

# Embed the chunks and index them in an in-memory FAISS vector store.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
docsearch = FAISS.from_documents(doc_chunks, embeddings)


def retrieve_context(query):
    """Return the knowledge-base text most relevant to *query*.

    Performs a similarity search over the FAISS index and joins the top-2
    matching chunks with newlines.
    """
    results = docsearch.similarity_search(query, k=2)
    return "\n".join(res.page_content for res in results)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream an assistant reply grounded in the Crustdata docs.

    Yields the accumulated response text after each streamed token, as
    expected by gr.ChatInterface.

    Parameters:
        message: the current user message.
        history: prior (user, assistant) turns.
        system_message: system prompt from the UI.
        max_tokens / temperature / top_p: generation settings.
    """
    context = retrieve_context(message)

    # Keep the system prompt in the dedicated system role only; the original
    # also embedded it in the final user turn, sending it twice.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append(
        {"role": "user", "content": f"Context:\n{context}\n\nUser: {message}\nAssistant:"}
    )

    response = ""
    # Do not shadow the `message` parameter with the loop variable, and guard
    # against None deltas (role/finish chunks carry no content), which would
    # otherwise raise TypeError on `response += token`.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
        yield response


# Gradio chat UI: system prompt and sampling controls exposed as extra inputs.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a technical assistant for Crustdata APIs.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    title="Crustdata API Chatbot",
    description="Ask any technical questions about Crustdata’s Dataset and Discovery APIs.",
)

if __name__ == "__main__":
    demo.launch(share=True)