Spaces: Sleeping
Commit · 64fd6d4
Parent(s): 693072f
first commit
- app.py +81 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,81 @@
"""Core modules."""
from typing import List, Optional

import gradio as gr
from langchain import hub
from langchain.docstore.document import Document
from langchain.text_splitter import (
    CharacterTextSplitter,
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


def doc_to_embeddings(
    doc: List[Document],
    split_mode: str = "tiktoken",
    chunk_size: int = 1000,
    chunk_overlap: int = 5,
    faiss_save_path: Optional[str] = None,
    save_faiss: bool = False,
):
    """Split the loaded documents, embed the chunks, and build a FAISS index."""
    # Split by separator and merge by character count
    if split_mode == "character":
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Recursively split until chunks fall below the chunk size limit
    elif split_mode == "recursive_character":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Split on sentence boundaries using NLTK
    elif split_mode == "nltk":
        text_splitter = NLTKTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Measure chunk size in tokens via the tiktoken encoder
    elif split_mode == "tiktoken":
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    else:
        raise ValueError(
            "split_mode must be one of 'character', 'recursive_character', 'nltk' or 'tiktoken'."
        )
    documents = text_splitter.split_documents(doc)
    embeddings = OpenAIEmbeddings()
    faiss_db = FAISS.from_documents(documents, embeddings)
    if save_faiss and faiss_save_path:
        faiss_db.save_local(faiss_save_path)
    return faiss_db


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


def wrap_all(file, input_prompt: str):
    # Load the uploaded .docx file and build a FAISS retriever over it
    loader = Docx2txtLoader(file.name)
    data = loader.load()
    db = doc_to_embeddings(data)
    retriever = db.as_retriever()
    # Standard RAG prompt pulled from the LangChain hub
    prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain.invoke(input_prompt)


# Define the Gradio interface
iface = gr.Interface(
    fn=wrap_all,
    inputs=[
        gr.File(type="file", label=".docx file of the interview"),
        gr.Textbox(label="Enter your inquiry"),
    ],
    outputs="text",
    title="Interviews: QA and summarization",
    description="Upload a .docx file with the interview and enter the question you have or ask for a summarization.",
)

iface.launch()
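A minimal sketch of how the helpers above could be exercised outside the Gradio UI (assumptions: OPENAI_API_KEY is set in the environment, and "interview.docx" plus the query string are hypothetical placeholders, not files from this commit):

from langchain_community.document_loaders import Docx2txtLoader

# Load the interview and build the FAISS index with the same helper the app uses
docs = Docx2txtLoader("interview.docx").load()
db = doc_to_embeddings(docs, split_mode="tiktoken", chunk_size=1000, chunk_overlap=5)

# Inspect the retrieval step on its own before wiring up the full RAG chain
hits = db.similarity_search("What does the interviewee say about remote work?", k=3)
for hit in hits:
    print(hit.page_content[:200])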
requirements.txt
ADDED
@@ -0,0 +1,8 @@
langchain
numpy
pandas
openai
openpyxl
langchain_community
langchain_openai
langchain_core
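Note: judging from the imports in app.py above, a fresh environment would likely also need a few packages that are not pinned here (an assumption; on a Gradio Space the SDK image already provides gradio):

faiss-cpu    # backend for FAISS.from_documents
docx2txt     # used by Docx2txtLoader
tiktoken     # used by CharacterTextSplitter.from_tiktoken_encoder
nltk         # used by NLTKTextSplitter (also needs the 'punkt' tokenizer data)
gradio       # only when running outside the Spaces SDK image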