daneshjoy committed on
Commit
e230889
·
1 Parent(s): c8b3fc9

combined doc store creation and main app

Browse files
Files changed (1) hide show
  1. app.py +60 -2
app.py CHANGED
@@ -1,11 +1,69 @@
1
- import create_doc_store
2
-
3
  import os
4
 
5
  import streamlit as st
6
 
7
  from lfqa import prepare, answer
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  # Sliders
11
  DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
 
 
 
1
  import os
2
 
3
  import streamlit as st
4
 
5
  from lfqa import prepare, answer
6
 
7
# %% ------------------------------------------- Creating Doc store
# One-time bootstrap: when either persisted artifact (the SQLite metadata DB
# or the FAISS index file) is missing, rebuild both from the raw text files
# before the main app starts.
sql_file = 'faiss_doc_store.db'
faiss_file = 'faiss_index.faiss'

# NOTE(review): this existence check is resolved against the current working
# directory, while the build below switches to the module directory first.
# Confirm the app is always launched from the module directory.
if not os.path.exists(sql_file) or not os.path.exists(faiss_file):
    # Imported lazily so these packages are only loaded when a rebuild is
    # actually required.
    from haystack.document_stores import FAISSDocumentStore
    from haystack.nodes import DensePassageRetriever
    from haystack.utils import convert_files_to_docs, clean_wiki_text

    # The relative paths below (doc_dir, sql_file, faiss_file) are resolved
    # against the directory containing this file.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(module_dir)

    doc_dir = "data/wiki_gameofthrones_txt12"

    # %% Download/Load Docs

    # The corpus is expected to already be present in doc_dir. To fetch it:
    # s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
    # fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    print('---> Loading Documents ...')

    # Convert files to docs + cleaning
    docs = convert_files_to_docs(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )

    # %% Document Store

    print('---> Creating document store ...')
    # NOTE(review): embedding_dim presumably has to match the output size of
    # the DPR encoders configured below -- confirm against the model cards.
    document_store = FAISSDocumentStore(
        embedding_dim=128,
        faiss_index_factory_str="Flat",
        sql_url=f"sqlite:///{sql_file}",
    )

    # %% Retriever (DPR)

    print('---> Initializing retriever ...')
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
        passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
        use_gpu=True,
    )

    # %% Create Embeddings and save results
    print('---> Saving results ...')
    # update db -- the documents must be written BEFORE the embeddings are
    # computed; update_embeddings() only processes documents that are already
    # in the store, so calling it on an empty store indexes nothing.
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)
    # save faiss file
    document_store.save(faiss_file)

    print('Done!')
63
+
64
+
65
+ # %% ------------------------------------------- Main App
66
+
67
 
68
  # Sliders
69
  DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))