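"""Streamlit app for interactive patent data analysis with LangChain CSV agents.

Datasets can be loaded from the repository directory, from Hugging Face, or via
CSV upload, and then queried in natural language through a LangChain CSV agent.
"""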
import os
import tempfile
import streamlit as st
import pandas as pd
from datasets import load_dataset
import time
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_openai import ChatOpenAI
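# Note: the CSV agent below uses ChatOpenAI, which expects an OpenAI key in the
# OPENAI_API_KEY environment variable (read via os.getenv further down).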
# Streamlit App Title and Description
st.title("Patent Data Analysis with LangChain")
st.write("""This app allows you to analyze patent-related datasets interactively using LangChain agents. You can upload datasets, load from Hugging Face, or use a repository directory dataset.""")
# Dataset loading without caching to support progress bar
def load_huggingface_dataset(dataset_name):
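    """Load a Hugging Face dataset and return it as a pandas DataFrame.

    Note: the ``name="sample"`` and ``uniform_split=True`` arguments passed to
    ``load_dataset`` below are geared to the default HUPD/hupd dataset and may
    need to be adjusted for other datasets.
    """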
    # Initialize progress bar
    progress_bar = st.progress(0)
    try:
        # Incrementally update progress
        progress_bar.progress(10)
        dataset = load_dataset(dataset_name, name="sample", split="train", trust_remote_code=True, uniform_split=True)
        progress_bar.progress(50)
        if hasattr(dataset, "to_pandas"):
            df = dataset.to_pandas()
        else:
            df = pd.DataFrame(dataset)
        progress_bar.progress(100)  # Final update to 100%
        return df
    except Exception as e:
        progress_bar.progress(0)  # Reset progress bar on failure
        raise e
def load_uploaded_csv(uploaded_file):
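    """Read an uploaded CSV file into a pandas DataFrame, showing a progress bar."""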
    # Initialize progress bar
    progress_bar = st.progress(0)
    try:
        # Simulate progress
        progress_bar.progress(10)
        time.sleep(1)  # Simulate file processing delay
        progress_bar.progress(50)
        df = pd.read_csv(uploaded_file)
        progress_bar.progress(100)  # Final update
        return df
    except Exception as e:
        progress_bar.progress(0)  # Reset progress bar on failure
        raise e
# Dataset selection logic
def load_dataset_into_session():
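    """Let the user pick a data source and store the loaded DataFrame in st.session_state.df."""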
    input_option = st.radio(
        "Select Dataset Input:",
        ["Use Repo Directory Dataset", "Use Hugging Face Dataset", "Upload CSV File"], index=1, horizontal=True
    )
    # Option 1: Load dataset from the repo directory
    if input_option == "Use Repo Directory Dataset":
        file_path = "./source/test.csv"
        if st.button("Load Dataset"):
            try:
                with st.spinner("Loading dataset from the repo directory..."):
                    st.session_state.df = pd.read_csv(file_path)
                st.success(f"File loaded successfully from '{file_path}'!")
            except Exception as e:
                st.error(f"Error loading dataset from the repo directory: {e}")
    # Option 2: Load dataset from Hugging Face
    elif input_option == "Use Hugging Face Dataset":
        dataset_name = st.text_input(
            "Enter Hugging Face Dataset Name:", value="HUPD/hupd"
        )
        if st.button("Load Dataset"):
            try:
                st.session_state.df = load_huggingface_dataset(dataset_name)
                st.success(f"Hugging Face Dataset '{dataset_name}' loaded successfully!")
            except Exception as e:
                st.error(f"Error loading Hugging Face dataset: {e}")
    # Option 3: Upload CSV File
    elif input_option == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload a CSV File:", type=["csv"])
        if uploaded_file:
            try:
                st.session_state.df = load_uploaded_csv(uploaded_file)
                st.success("File uploaded successfully!")
            except Exception as e:
                st.error(f"Error reading uploaded file: {e}")
# Load dataset into session
load_dataset_into_session()
if "df" in st.session_state:
    df = st.session_state.df
    # Display dataset metadata
    st.write("### Dataset Metadata")
    st.text(f"Number of Rows: {df.shape[0]}")
    st.text(f"Number of Columns: {df.shape[1]}")
    st.text(f"Column Names: {', '.join(df.columns)}")
    # Display dataset preview
    st.write("### Dataset Preview")
    num_rows = st.slider("Select number of rows to display:", min_value=5, max_value=50, value=10)
    st.dataframe(df.head(num_rows))
    # Define LangChain CSV Agent
    st.header("Run Queries on Patent Data")
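    # create_csv_agent reads from a file path, so the DataFrame is written to a
    # temporary CSV before the agent is constructed.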
    with st.spinner("Setting up LangChain CSV Agent..."):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            df.to_csv(temp_file.name, index=False)
        csv_agent = create_csv_agent(
            ChatOpenAI(temperature=0, model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
            path=[temp_file.name],
            verbose=True,
            agent_type=AgentType.OPENAI_FUNCTIONS,
            allow_dangerous_code=True
        )
    # Query Input and Execution
    query = st.text_area("Enter your natural language query:", "How many patents are related to AI?")
    if st.button("Run Query"):
        with st.spinner("Running query..."):
            try:
                # Token limit configuration: large datasets are queried in chunks so
                # each prompt stays within the model's context window
                max_rows = 200  # Rows per chunk
                total_rows = len(df)
                if total_rows > max_rows:
                    results = []
                    for start in range(0, total_rows, max_rows):
                        chunk = df.iloc[start:start + max_rows]
                        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as chunk_file:
                            chunk.to_csv(chunk_file.name, index=False)
                        # Rebuild the agent for the current chunk
                        csv_agent = create_csv_agent(
                            ChatOpenAI(temperature=0, model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
                            path=[chunk_file.name],
                            verbose=False,
                            agent_type=AgentType.OPENAI_FUNCTIONS,
                            allow_dangerous_code=True
                        )
                        result = csv_agent.invoke(query)
                        results.append(result)
                    st.success("Query executed successfully!")
                    st.write("### Combined Query Results:")
                    # invoke() returns a dict; join only the answer text from each chunk
                    st.write("\n\n".join(str(r.get("output", r)) for r in results))
                else:
                    result = csv_agent.invoke(query)
                    st.success("Query executed successfully!")
                    st.write("### Query Result:")
                    st.write(result)
            except Exception as e:
                st.error(f"Error executing query: {e}")