ryota39 committed on
Commit
6da9c31
·
verified ·
1 Parent(s): 972c3ed

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -0
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import streamlit as st
4
+ from langchain_community.vectorstores.faiss import FAISS
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+
7
+
8
# NOTE(review): presumably set to work around duplicate OpenMP runtime errors
# raised by the FAISS / PyTorch stack — confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Page title and wide layout for the search UI.
st.set_page_config(page_title="ICLR2025 Paper Search", layout="wide")
10
+
11
+
12
@st.cache_resource
def create_vector_store(
    vector_store_path: str,
    embedding_model_name: str,
) -> FAISS:
    """Load a saved FAISS index together with its embedding model.

    The function is decorated with ``st.cache_resource``, so Streamlit can
    reuse the loaded store for identical arguments instead of reloading it.

    Args:
        vector_store_path: Folder passed to ``FAISS.load_local``.
        embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The FAISS vector store loaded from ``vector_store_path``.
    """
    # SECURITY NOTE: allow_dangerous_deserialization=True permits pickle-based
    # loading of the stored index data. Only point this at an index that was
    # built by a trusted source.
    return FAISS.load_local(
        folder_path=vector_store_path,
        embeddings=HuggingFaceEmbeddings(model_name=embedding_model_name),
        allow_dangerous_deserialization=True,
    )
24
+
25
+
26
def _split_document(content: str) -> tuple:
    """Split one stored document string into ``(title, abstract, url)``."""
    # NOTE(review): the separator is a literal backslash followed by "n", not a
    # newline character — presumably how the index was serialized; confirm
    # against the code that built the vector store.
    separator = "\\n"
    begin_marker = "<BEGIN_URL>"
    end_marker = "<END_URL>"

    # Title: everything before the first separator.
    title = content.partition(separator)[0]
    # Abstract: text after the last separator, up to the URL marker; an
    # ellipsis is appended for display.
    abstract = content.rpartition(separator)[2].partition(begin_marker)[0] + "..."
    # URL: text between the last begin marker and the following end marker.
    url = content.rpartition(begin_marker)[2].partition(end_marker)[0]
    return title, abstract, url


def grab_topk(
    input_text: str,
    vector_store: FAISS,
    top_k: int,
) -> pd.DataFrame:
    """Return the ``top_k`` stored documents most similar to ``input_text``.

    Args:
        input_text: Query text to search for.
        vector_store: Vector store to query; only ``as_retriever`` is used.
        top_k: Number of documents to retrieve.

    Returns:
        A DataFrame with the columns ``title``, ``abstract`` and ``url``, one
        row per retrieved document, in the order the retriever returned them.
    """
    # Ask for exactly top_k hits. The previous "top_k + 1" returned one row
    # more than requested because nothing was ever dropped afterwards.
    retriever = vector_store.as_retriever(search_kwargs={"k": top_k})
    # ``invoke`` replaces the deprecated ``get_relevant_documents``.
    relevant_docs = retriever.invoke(input_text)

    rows = [_split_document(doc.page_content) for doc in relevant_docs]
    return pd.DataFrame(rows, columns=["title", "abstract", "url"])
47
+
48
+
49
if __name__ == "__main__":
    # Load the FAISS index from the local "db" folder (cached by
    # create_vector_store's st.cache_resource decorator).
    vector_store_path = "db"
    embedding_model_name = "intfloat/multilingual-e5-large-instruct"
    vector_store = create_vector_store(
        vector_store_path,
        embedding_model_name,
    )

    # Page header and reference links.
    st.markdown("## ICLR2025")
    st.markdown("- list of papers (https://iclr.cc/Downloads/2025)")
    st.markdown(
        "- repository (https://github.com/ohashi3399/paper-sonar?tab=readme-ov-file)"
    )

    # Search controls: free-text query and number of results.
    input_text = st.text_input(
        "query",
        "",
        placeholder="Enter the keywords you are interested in...",
    )
    top_k = st.number_input("top_k", min_value=1, value=10, step=1)

    if st.button("検索"):
        stripped_input_text = input_text.strip()
        if not stripped_input_text:
            # An empty query has nothing meaningful to embed; ask for input
            # instead of running a search.
            st.warning("Please enter a query before searching.")
        else:
            df = grab_topk(
                stripped_input_text,
                vector_store,
                int(top_k),
            )
            st.table(df)