Spaces:

tomascufaro
/

interview_explorer

Sleeping

App Files Files Community

tomascufaro committed on Jul 4, 2024

Commit

1fecdf1

1 Parent(s): 632d9b5

new app.py

Browse files

Files changed (2) hide show

app.py +135 -20
requirements.txt +223 -11

app.py CHANGED Viewed

@@ -8,14 +8,38 @@ from langchain import hub
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
 import os
 import gradio as gr
-def doc_to_embeddings(doc:Document, split_mode:str='tiktoken',
-                      chunk_size:int=1000, chunk_overlap:int=5, faiss_save_path:str=None, save_faiss:bool=None):
-    # Load the PDF file (if the file is a URL, load the PDF file from the URL)
     # Split by separator and merge by character count
     if split_mode == "character":
         # Create a CharacterTextSplitter object
@@ -42,40 +66,131 @@ def doc_to_embeddings(doc:Document, split_mode:str='tiktoken',
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,)
     else:
-        raise ValueError("Please specify the split mode.")
     documents = text_splitter.split_documents(doc)
     embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OpenAI_APIKEY'])
     faiss_db = FAISS.from_documents(documents, embeddings)
     if save_faiss:
         faiss_db.save_local(faiss_save_path)
     return faiss_db
-def format_docs(docs):
-    return "\n\n".join(doc.page_content for doc in docs)
-def wrap_all(file, input_prompt:str):
-    loader = Docx2txtLoader(file)
-    data = loader.load()
-    db = doc_to_embeddings(data)
-    retriever = db.as_retriever()
-    prompt = hub.pull("rlm/rag-prompt")
-    llm = ChatOpenAI(model_name="gpt-4",openai_api_key=os.environ['OpenAI_APIKEY'], temperature=0)
     rag_chain = (
-                {"context": retriever | format_docs, "question": RunnablePassthrough()}
-                | prompt
-                | llm
-                | StrOutputParser()
-                )
     return rag_chain.invoke(input_prompt)
 # Define the Gradio interface
 iface = gr.Interface(
     fn=wrap_all,
-    inputs=[gr.File(type="filepath", label=".docx file of the interview"), gr.Textbox(label="Enter your inquiry")],
     outputs="text",
     title="Interviews: QA and summarization",
     description="Upload a .docx file with the interview and enter the question you have or ask for a summarization.")
 iface.launch()

 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
+from langchain_community.vectorstores import Chroma
 import os
 import gradio as gr
+import os
+from typing import List
+from pydantic import BaseModel
+from langchain_core.prompts import ChatPromptTemplate
+from unstructured.partition.pdf import partition_pdf
+import uuid
+from langchain.retrievers.multi_vector import MultiVectorRetriever
+from langchain.storage import InMemoryStore
+from langchain_community.document_loaders import UnstructuredPDFLoader
+# The vectorstore to use to index the child chunks
+vectorstore = Chroma(
+    collection_name="rag_app",embedding_function=OpenAIEmbeddings(api_key="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"))
+# The storage layer for the parent documents
+store = InMemoryStore()
+id_key = "doc_id"
+# The retriever (empty to start)
+retriever = MultiVectorRetriever(
+    vectorstore=vectorstore,
+    docstore=store,
+    id_key=id_key,
+)
+def split_text(doc:str, split_mode:str='tiktoken',
+                      chunk_size:int=1000, chunk_overlap:int=5, faiss_save_path:str=None, save_faiss:bool=None):
     # Split by separator and merge by character count
     if split_mode == "character":
         # Create a CharacterTextSplitter object
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,)
     else:
+        raise ValueError("Please specify the split mode.")
     documents = text_splitter.split_documents(doc)
+    return documents
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+class Element(BaseModel):
+    type: str
+    text: str
+def save_documents(Documents):  # NOTE(review): replaced by the save_documents(texts, ...) definition that follows
     embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OpenAI_APIKEY'])
     faiss_db = FAISS.from_documents(documents, embeddings)  # NOTE(review): `documents` is not the parameter name (`Documents`)
     if save_faiss:  # NOTE(review): `save_faiss` / `faiss_save_path` are not parameters of this function
         faiss_db.save_local(faiss_save_path)
     return faiss_db
+def save_documents(texts, text_summaries, tables, table_summaries):
+    # Add texts
+    doc_ids = [str(uuid.uuid4()) for _ in texts]
+    summary_texts = [
+        Document(page_content=s, metadata={id_key: doc_ids[i]})
+        for i, s in enumerate(text_summaries)
+    ]
+    retriever.vectorstore.add_documents(summary_texts)
+    retriever.docstore.mset(list(zip(doc_ids, texts)))
+    # Add tables
+    table_ids = [str(uuid.uuid4()) for _ in tables]
+    summary_tables = [
+        Document(page_content=s, metadata={id_key: table_ids[i]})
+        for i, s in enumerate(table_summaries)
+    ]
+    retriever.vectorstore.add_documents(summary_tables)
+    retriever.docstore.mset(list(zip(table_ids, tables)))
+def doc_processing(files: List[bytes]):
+    docs = []
+    tables = []
+    for file in files:
+        if file.name.endswith(".pdf"):
+    # Identify file type and process accordingly
+            raw_pdf_elements = partition_pdf(
+            filename=file,
+            extract_images_in_pdf=False,
+            infer_table_structure=True,
+            chunking_strategy="by_title",
+            max_characters=4000,
+            new_after_n_chars=3800,
+            combine_text_under_n_chars=2000,
+            image_output_dir_path='/tmp',  # Change this to your desired path
+            )
+            categorized_elements = []
+            for element in raw_pdf_elements:
+                if "unstructured.documents.elements.Table" in str(type(element)):
+                    categorized_elements.append(Element(type="table", text=str(element)))
+                elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
+                    categorized_elements.append(Element(type="text", text=str(element)))
+            # Extract text and table elements
+            text_elements = [e for e in categorized_elements if e.type == "text"]
+            table_elements = [e for e in categorized_elements if e.type == "table"]
+            docs.extend(text_elements)
+            tables.extend(table_elements)
+        elif file.name.endswith(".docx"):
+            # Process DOCX file using LangChain Docx2txtLoader
+            loader = Docx2txtLoader(file)
+            data = loader.load()
+            docs.extend(data)
+    # Prompt
+    prompt_text = """You are an assistant tasked with summarizing tables and text.
+    Give a concise summary of the table or text. Table or text chunk: {element} """
+    prompt = ChatPromptTemplate.from_template(prompt_text)
+    # Summary chain
+    model = ChatOpenAI(temperature=0, model="gpt-4")
+    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
+    table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
+    text_summaries = summarize_chain.batch(docs, {"max_concurrency": 5})
+    return docs, tables, text_summaries, table_summaries
+    # Convert the list of document texts to embeddings
+def wrap_all(files: List[bytes], input_prompt: str):
+    save_documents(doc_processing(files))
+        # Prompt template
+    template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
+                 If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. Please cite the text that you are using to base your arguments when it is possible.
+                Question: {question}
+                Context: {context}
+                Answer:
+                    """
+    prompt = ChatPromptTemplate.from_template(template)
+    # Load the prompt template and the language model
+    #prompt = hub.pull("rlm/rag-prompt")
+    llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.environ['OpenAI_APIKEY'], temperature=0)
+    # Create the RAG chain
     rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+    # Invoke the chain with the input prompt
     return rag_chain.invoke(input_prompt)
 # Define the Gradio interface: a file upload plus a text box in, plain text out
 iface = gr.Interface(
     fn=wrap_all,  # called with (files, input_prompt), matching the two inputs below
+    inputs=[gr.File(type="filepath", label=".docx file of the interview", file_count='multiple'), gr.Textbox(label="Enter your inquiry")],  # NOTE(review): label says .docx only, but doc_processing also handles .pdf
     outputs="text",
     title="Interviews: QA and summarization",
     description="Upload a .docx file with the interview and enter the question you have or ask for a summarization.")  # NOTE(review): same .docx-only wording as the label
 iface.launch()  # runs at module top level, i.e. as soon as this file is executed

requirements.txt CHANGED Viewed

@@ -1,11 +1,223 @@
-langchain
-numpy
-pandas
-openai
-openpyxl
-langchain_community
-langchain_openai
-langchain_core
-docx2txt
-faiss-cpu
-langchainhub

+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+asgiref==3.8.1
+asttokens==2.4.1
+attrs==23.2.0
+backoff==2.2.1
+bcrypt==4.1.3
+beautifulsoup4==4.12.3
+build==1.2.1
+cachetools==5.3.3
+certifi==2024.6.2
+cffi==1.16.0
+chardet==5.2.0
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.5.3
+click==8.1.7
+cmake==3.29.6
+colorama==0.4.6
+coloredlogs==15.0.1
+comm==0.2.2
+contourpy==1.2.1
+cryptography==42.0.8
+cycler==0.12.1
+dataclasses-json==0.6.7
+debugpy==1.8.2
+decorator==5.1.1
+deepdiff==7.0.1
+Deprecated==1.2.14
+distro==1.9.0
+dnspython==2.6.1
+docx2txt==0.8
+email_validator==2.2.0
+emoji==2.12.1
+et-xmlfile==1.1.0
+executing==2.0.1
+faiss-cpu==1.8.0.post1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.15.4
+filetype==1.2.0
+flatbuffers==24.3.25
+fonttools==4.53.0
+frozenlist==1.4.1
+fsspec==2024.6.1
+google-auth==2.31.0
+googleapis-common-protos==1.63.2
+gradio==4.37.2
+gradio_client==1.0.2
+greenlet==3.0.3
+grpcio==1.64.1
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.4
+humanfriendly==10.0
+idna==3.7
+importlib_metadata==7.1.0
+importlib_resources==6.4.0
+intel-openmp==2021.4.0
+iopath==0.1.10
+ipykernel==6.29.5
+ipython==8.26.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+jsonpointer==3.0.0
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+jupyter_client==8.6.2
+jupyter_core==5.7.2
+kiwisolver==1.4.5
+kubernetes==30.1.0
+langchain==0.2.6
+langchain-community==0.2.6
+langchain-core==0.2.10
+langchain-openai==0.1.13
+langchain-text-splitters==0.2.2
+langchainhub==0.1.20
+langdetect==1.0.9
+langsmith==0.1.83
+layoutparser==0.3.4
+lxml==5.2.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.3
+matplotlib==3.9.0
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mkl==2021.4.0
+mmh3==4.1.0
+monotonic==1.6
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+oauthlib==3.2.2
+onnx==1.16.1
+onnxruntime==1.18.1
+openai==1.35.7
+opencv-python==4.10.0.84
+openpyxl==3.1.5
+opentelemetry-api==1.25.0
+opentelemetry-exporter-otlp-proto-common==1.25.0
+opentelemetry-exporter-otlp-proto-grpc==1.25.0
+opentelemetry-instrumentation==0.46b0
+opentelemetry-instrumentation-asgi==0.46b0
+opentelemetry-instrumentation-fastapi==0.46b0
+opentelemetry-proto==1.25.0
+opentelemetry-sdk==1.25.0
+opentelemetry-semantic-conventions==0.46b0
+opentelemetry-util-http==0.46b0
+ordered-set==4.1.0
+orjson==3.10.5
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+pdf2image==1.17.0
+pdfminer.six==20231228
+pdfplumber==0.11.1
+pillow==10.4.0
+pillow_heif==0.17.0
+pkgconfig==1.5.5
+platformdirs==4.2.2
+poppler-utils==0.1.0
+portalocker==2.10.0
+posthog==3.5.0
+prompt_toolkit==3.0.47
+protobuf==4.25.3
+psutil==6.0.0
+pure-eval==0.2.2
+pyasn1==0.6.0
+pyasn1_modules==0.4.0
+pycparser==2.22
+pycryptodome==3.20.0
+pydantic==2.8.0
+pydantic_core==2.20.0
+pydub==0.25.1
+Pygments==2.18.0
+pykg-config==1.3.0
+pyparsing==3.1.2
+pypdf==4.2.0
+pypdfium2==4.30.0
+PyPika==0.48.9
+pyproject_hooks==1.1.0
+pyreadline3==3.4.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-iso639==2024.4.27
+python-magic==0.4.27
+python-multipart==0.0.9
+pytz==2024.1
+pywin32==306; sys_platform == "win32"
+PyYAML==6.0.1
+pyzmq==26.0.3
+rapidfuzz==3.9.3
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.7.1
+rpds-py==0.18.1
+rsa==4.9
+ruff==0.5.0
+safetensors==0.4.3
+scipy==1.14.0
+semantic-version==2.10.0
+setuptools==70.1.1
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.31
+stack-data==0.6.3
+starlette==0.37.2
+sympy==1.12.1
+tabulate==0.9.0
+tbb==2021.13.0
+tenacity==8.4.2
+tiktoken==0.7.0
+timm==1.0.7
+tokenizers==0.19.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.3.1
+torchvision==0.18.1
+tornado==6.4.1
+tqdm==4.66.4
+traitlets==5.14.3
+transformers==4.42.3
+typer==0.12.3
+types-requests==2.32.0.20240622
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+unstructured==0.14.9
+unstructured-client==0.23.8
+unstructured-inference==0.7.36
+unstructured.pytesseract==0.3.12
+urllib3==2.2.2
+uvicorn==0.30.1
+watchfiles==0.22.0
+wcwidth==0.2.13
+websocket-client==1.8.0
+websockets==11.0.3
+wheel==0.43.0
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.19.2