from typing import Iterable
import streamlit as st
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
from config import DB_CONFIG
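# DB_CONFIG is expected to unpack into (url, api_key, collection_name),
# matching how it is consumed in get_similay() below.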


@st.cache_resource
def load_embeddings():
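    """Load the intfloat/multilingual-e5-large embedding model.

    Runs on GPU when CUDA is available, otherwise on CPU. The
    @st.cache_resource decorator keeps a single instance alive across
    Streamlit reruns.
    """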
    model_name = "intfloat/multilingual-e5-large"
    model_kwargs = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
    encode_kwargs = {"normalize_embeddings": False}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embeddings


EMBEDDINGS = load_embeddings()


def make_filter_obj(options: list[dict[str, str]]):
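    """Build a Qdrant Filter whose `must` clause exact-matches every key/value pair."""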
    must = []
    for option in options:
        must.append(
            FieldCondition(key=option["key"], match=MatchValue(value=option["value"]))
        )
    filter = Filter(must=must)
    return filter


def get_similay(query: str, filter: Filter):
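    """Run a filtered similarity search against the Qdrant collection.

    Returns the top 20 (Document, score) pairs for the query, restricted
    by the given metadata filter.
    """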
    db_url, db_api_key, db_collection_name = DB_CONFIG
    client = QdrantClient(url=db_url, api_key=db_api_key)
    db = Qdrant(
        client=client, collection_name=db_collection_name, embeddings=EMBEDDINGS
    )
    docs = db.similarity_search_with_score(
        query,
        k=20,
        filter=filter,
    )
    return docs


def main(
    query: str,
    repo_name: str,
    query_options: str,
) -> Iterable[tuple[str, str, int, str, float, bool]]:
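    """Search the selected repository and yield one result tuple per hit.

    Each yielded tuple is (title, url, id_, text, score, is_comment),
    where `text` is the matched page content and `score` is the
    similarity score returned by Qdrant.
    """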
    options = [{"key": "metadata.repo_name", "value": repo_name}]
    if query_options == "Empty":
        query_options = ""
    query_str = f"{query_options}{query}"
    filter = make_filter_obj(options=options)
    docs = get_similay(query_str, filter)
    for doc, score in docs:
        text = doc.page_content
        metadata = doc.metadata
        # print(metadata)
        title = metadata.get("title")
        url = metadata.get("url")
        id_ = metadata.get("id")
        is_comment = metadata.get("type_") == "comment"
        yield title, url, id_, text, score, is_comment


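# Streamlit UI: a single form with the free-text query, a repository
# selector, and an optional query prefix; results are rendered on submit.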
with st.form("my_form"):
    st.title("GitHub Issue Search")
    query = st.text_input(label="query")
    repo_name = st.radio(
        options=[
            "cpython",
            "pyvista",
            "plone",
            "volto",
            "plone.restapi",
            "nvda",
            "nvdajp",
            "cocoa",
        ],
        label="Repo name",
    )
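    # E5-style models are typically used with "query: " / "passage: "
    # prefixes, so these options let the user prepend a prefix to the
    # search text (or leave it off entirely).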
    query_options = st.radio(
        options=[
            "query: ",
            "query: passage: ",
            "Empty",
        ],
        label="Query options",
    )

    submitted = st.form_submit_button("Submit")
    if submitted:
        st.divider()
        st.header("Search Results")
        st.divider()
        with st.spinner("Searching..."):
            results = main(query, repo_name, query_options)
            for title, url, id_, text, score, is_comment in results:
                with st.container():
                    if not is_comment:
                        st.subheader(f"#{id_} - {title}")
                    else:
                        st.subheader(f"Comment on: {title}")
                    st.write(url)
                    st.write(text)
                    st.write(score)
                    # st.markdown(html, unsafe_allow_html=True)
                    st.divider()