import os
import tempfile
import time

import pandas as pd
import streamlit as st
from datasets import load_dataset
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_openai import ChatOpenAI
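
# NOTE: ChatOpenAI below is given api_key=os.getenv("OPENAI_API_KEY"), so the
# OPENAI_API_KEY environment variable must be set before launching the app.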

st.title("Patent Data Analysis with LangChain")

st.write(
    """This app allows you to analyze patent-related datasets interactively using
LangChain agents. You can upload a CSV file, load a dataset from Hugging Face,
or use the dataset bundled in the repository."""
)


def load_huggingface_dataset(dataset_name):
    """Load a Hugging Face dataset and return it as a pandas DataFrame."""
    progress_bar = st.progress(0)
    try:
        progress_bar.progress(10)
        # NOTE: name="sample" and uniform_split=True are loader options for the
        # default HUPD/hupd dataset; other datasets may not accept these kwargs.
        dataset = load_dataset(
            dataset_name,
            name="sample",
            split="train",
            trust_remote_code=True,
            uniform_split=True,
        )
        progress_bar.progress(50)
        if hasattr(dataset, "to_pandas"):
            df = dataset.to_pandas()
        else:
            df = pd.DataFrame(dataset)
        progress_bar.progress(100)
        return df
    except Exception:
        progress_bar.progress(0)
        raise


def load_uploaded_csv(uploaded_file):
    """Read an uploaded CSV file into a pandas DataFrame."""
    progress_bar = st.progress(0)
    try:
        progress_bar.progress(10)
        time.sleep(1)  # brief pause so the progress bar is visible for small files
        progress_bar.progress(50)
        df = pd.read_csv(uploaded_file)
        progress_bar.progress(100)
        return df
    except Exception:
        progress_bar.progress(0)
        raise
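
# Both loaders reset their progress bar and re-raise on failure, so the calling
# code can surface the error to the user with st.error().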

def load_dataset_into_session():
    input_option = st.radio(
        "Select Dataset Input:",
        ["Use Repo Directory Dataset", "Use Hugging Face Dataset", "Upload CSV File"],
        index=1,
        horizontal=True,
    )

    if input_option == "Use Repo Directory Dataset":
        file_path = "./source/test.csv"
        if st.button("Load Dataset"):
            try:
                with st.spinner("Loading dataset from the repo directory..."):
                    st.session_state.df = pd.read_csv(file_path)
                st.success(f"File loaded successfully from '{file_path}'!")
            except Exception as e:
                st.error(f"Error loading dataset from the repo directory: {e}")

    elif input_option == "Use Hugging Face Dataset":
        dataset_name = st.text_input(
            "Enter Hugging Face Dataset Name:", value="HUPD/hupd"
        )
        if st.button("Load Dataset"):
            try:
                st.session_state.df = load_huggingface_dataset(dataset_name)
                st.success(f"Hugging Face Dataset '{dataset_name}' loaded successfully!")
            except Exception as e:
                st.error(f"Error loading Hugging Face dataset: {e}")

    elif input_option == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload a CSV File:", type=["csv"])
        if uploaded_file:
            try:
                st.session_state.df = load_uploaded_csv(uploaded_file)
                st.success("File uploaded successfully!")
            except Exception as e:
                st.error(f"Error reading uploaded file: {e}")

load_dataset_into_session()

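# Streamlit re-executes this script on every interaction; st.session_state.df
# keeps the loaded DataFrame alive across those reruns.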
if "df" in st.session_state:
    df = st.session_state.df

    st.write("### Dataset Metadata")
    st.text(f"Number of Rows: {df.shape[0]}")
    st.text(f"Number of Columns: {df.shape[1]}")
    st.text(f"Column Names: {', '.join(df.columns)}")

    st.write("### Dataset Preview")
    num_rows = st.slider(
        "Select number of rows to display:", min_value=5, max_value=50, value=10
    )
    st.dataframe(df.head(num_rows))

    st.header("Run Queries on Patent Data")

    with st.spinner("Setting up LangChain CSV Agent..."):
        # The agent reads its data from disk, so persist the DataFrame to a
        # temporary CSV first (delete=False keeps the file around for the agent).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            df.to_csv(temp_file.name, index=False)

        csv_agent = create_csv_agent(
            ChatOpenAI(temperature=0, model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
            path=[temp_file.name],
            verbose=True,
            agent_type=AgentType.OPENAI_FUNCTIONS,
            allow_dangerous_code=True,
        )
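
    # allow_dangerous_code=True is required by langchain_experimental because the
    # CSV agent answers queries by generating and executing pandas code in-process,
    # i.e. arbitrary code execution. Only enable it for trusted queries and data.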

    query = st.text_area("Enter your natural language query:", "How many patents are related to AI?")

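    # NOTE: for frames larger than max_rows, the query below is re-run
    # independently on each 200-row chunk, so the "combined" result is a list of
    # per-chunk answers rather than a true aggregate over the whole dataset.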
    if st.button("Run Query"):
        with st.spinner("Running query..."):
            try:
                max_rows = 200
                total_rows = len(df)

                if total_rows > max_rows:
                    results = []
                    for start in range(0, total_rows, max_rows):
                        chunk = df.iloc[start:start + max_rows]
                        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as chunk_file:
                            chunk.to_csv(chunk_file.name, index=False)

                        # Build a fresh agent over just this chunk of the data.
                        chunk_agent = create_csv_agent(
                            ChatOpenAI(temperature=0, model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
                            path=[chunk_file.name],
                            verbose=False,
                            agent_type=AgentType.OPENAI_FUNCTIONS,
                            allow_dangerous_code=True,
                        )
                        result = chunk_agent.invoke(query)
                        # invoke() returns a dict; keep only the final answer text
                        # so the per-chunk results can be joined below.
                        results.append(result["output"])
                        os.unlink(chunk_file.name)  # remove the per-chunk temp file

                    st.success("Query executed successfully!")
                    st.write("### Combined Query Results:")
                    st.write("\n".join(results))

                else:
                    result = csv_agent.invoke(query)
                    st.success("Query executed successfully!")
                    st.write("### Query Result:")
                    st.write(result["output"])

            except Exception as e:
                st.error(f"Error executing query: {e}")