tomascufaro committed on
Commit
64fd6d4
·
1 Parent(s): 693072f

first commit

Browse files
Files changed (2) hide show
  1. app.py +81 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.docstore.document import Document
2
+ """Core Modules s"""
3
+ from typing import Union, Optional, List, Sequence
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter, CharacterTextSplitter
6
+ from langchain.vectorstores.faiss import FAISS
7
+ from langchain_community.document_loaders import Docx2txtLoader
8
+ from langchain import hub
9
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
10
+ from langchain_core.runnables import RunnablePassthrough
11
+ from langchain_core.output_parsers import StrOutputParser
12
+ import gradio as gr
13
+
14
+
15
def doc_to_embeddings(doc: Sequence[Document], split_mode: str = 'tiktoken',
                      chunk_size: int = 1000, chunk_overlap: int = 5,
                      faiss_save_path: str = None, save_faiss: bool = None):
    """Split documents into chunks, embed them with OpenAI, and build a FAISS index.

    Args:
        doc: Sequence of ``Document`` objects to index (e.g. the result of a
            langchain loader's ``load()``).
        split_mode: Splitter to use — one of ``"character"``,
            ``"recursive_character"``, ``"nltk"`` or ``"tiktoken"``.
        chunk_size: Maximum size of each chunk.
        chunk_overlap: Overlap between adjacent chunks.
        faiss_save_path: Directory to persist the index to when *save_faiss*
            is truthy.
        save_faiss: When truthy, save the index locally to *faiss_save_path*.

    Returns:
        A ``FAISS`` vector store built from the document chunks.

    Raises:
        ValueError: If *split_mode* is not recognised, or if saving was
            requested without a *faiss_save_path*.
    """
    # Select the text splitter implementation for the requested mode.
    if split_mode == "character":
        # Split by separator and merge by character count.
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "recursive_character":
        # Recursively split until below the chunk size limit.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "nltk":
        # Sentence-aware splitting via NLTK.
        text_splitter = NLTKTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "tiktoken":
        # Character splitter that measures chunk length in tiktoken tokens.
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    else:
        # Name the offending value and the valid options instead of a vague message.
        raise ValueError(
            f"Unknown split_mode {split_mode!r}; expected one of "
            "'character', 'recursive_character', 'nltk', 'tiktoken'."
        )
    # Fail fast here rather than letting save_local(None) crash after the
    # (expensive) embedding call below.
    if save_faiss and not faiss_save_path:
        raise ValueError("save_faiss is set but no faiss_save_path was given.")
    documents = text_splitter.split_documents(doc)
    embeddings = OpenAIEmbeddings()
    faiss_db = FAISS.from_documents(documents, embeddings)
    if save_faiss:
        faiss_db.save_local(faiss_save_path)
    return faiss_db
52
+
53
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, blank-line separated."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)
55
+
56
+
57
def wrap_all(file, input_prompt: str):
    """Answer *input_prompt* with a RAG chain over the uploaded .docx file.

    Args:
        file: The uploaded interview — either a path string, or an object
            exposing a ``.name`` path attribute (what gradio's ``File``
            component passes depends on its ``type`` setting).
        input_prompt: The user's question or summarization request.

    Returns:
        The LLM's answer as a string.
    """
    # Accept both a plain path (gradio type="filepath") and a file wrapper
    # with .name (older gradio type="file"), so this works with either API.
    path = file if isinstance(file, str) else file.name
    loader = Docx2txtLoader(path)
    data = loader.load()
    db = doc_to_embeddings(data)
    retriever = db.as_retriever()
    # Community RAG prompt template pulled from the LangChain hub.
    prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain.invoke(input_prompt)
71
+
72
+
73
# Define the Gradio interface: a .docx upload plus a free-text inquiry box,
# wired to wrap_all(), with the model's answer rendered as plain text.
iface = gr.Interface(
    fn=wrap_all,
    # NOTE(review): gr.File(type="file") is only accepted by older Gradio
    # releases; Gradio 4.x only allows "filepath" or "binary". requirements.txt
    # does not pin (or even list) gradio — confirm the installed version.
    inputs=[gr.File(type="file", label=".docx file of the interview"), gr.Textbox(label="Enter your inquiry")],
    outputs="text",
    title="Interviews: QA and summarization",
    description="Upload a .docx file with the interview and enter the question you have or ask for a summarization.")

# Launch the local Gradio server (blocking call).
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
langchain
numpy
pandas
openai
openpyxl
langchain_community
langchain_openai
langchain_core
gradio
docx2txt
faiss-cpu
tiktoken
nltk