KennethTM commited on
Commit
0669d1e
·
verified ·
1 Parent(s): 752e648

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import hnswlib
3
+ from cryptography.fernet import Fernet
4
+ from sentence_transformers import SentenceTransformer, CrossEncoder
5
+ from dotenv import load_dotenv
6
+ import os
7
+ import gzip
8
+ import io
9
+ import pandas as pd
10
+ from bs4 import BeautifulSoup
11
+
12
load_dotenv()


def _read_encrypted_parquet(path, cipher):
    """Return the DataFrame stored in the encrypted parquet file at *path*.

    The file is opened with gzip, its content is decrypted with
    ``cipher.decrypt`` and the resulting bytes are parsed by
    ``pd.read_parquet``.
    """
    with gzip.open(path, "rb") as f:
        encrypted = f.read()
    return pd.read_parquet(io.BytesIO(cipher.decrypt(encrypted)))


# Fail early with a readable message; calling .encode() on a missing
# variable would otherwise surface as an AttributeError on None.
_key = os.environ.get("KEY")
if _key is None:
    raise RuntimeError("Environment variable KEY (the Fernet key) is not set")

fernet = Fernet(_key.encode("utf-8"))

# read data
meta_title = _read_encrypted_parquet("title_metadata.gz", fernet)
meta_corpus = _read_encrypted_parquet("corpus_metadata.gz", fernet)
29
+
30
# load models: `model` embeds queries for the nearest-neighbour lookup,
# `crossencoder` scores (query, text) pairs; both run on CPU
model = SentenceTransformer("KennethTM/MiniLM-L6-danish-encoder", device="cpu")
crossencoder = CrossEncoder("KennethTM/MiniLM-L6-danish-reranker", device="cpu")
embedding_size = model.get_sentence_embedding_dimension()


def _open_index(path, dim, ef=40):
    """Load the cosine-space hnswlib index stored at *path* and set its ef value."""
    index = hnswlib.Index(space='cosine', dim=dim)
    index.load_index(path)
    index.set_ef(ef)
    return index


# set up indexes
title_index_path = "title.index"
corpus_index_path = "corpus.index"
title_index = _open_index(title_index_path, embedding_size)
corpus_index = _open_index(corpus_index_path, embedding_size)

# per search target: the index, its metadata frame and the column
# whose text is handed to the cross-encoder
data = {
    "title": {
        "index": title_index,
        "meta": meta_title,
        "column": "title",
    },
    "corpus": {
        "index": corpus_index,
        "meta": meta_corpus,
        "column": "text_chunks",
    },
}

# holds the most recent search query; starts out empty
state = {"query": None}
53
+
54
+ #function find most similar candidates
55
def get_hits(query, index_name, top_k, top_k_multiplier = 2):
    """Return the best matches for *query* from the index named *index_name*.

    The query is embedded with ``model``, ``top_k * top_k_multiplier``
    nearest neighbours are fetched from the hnswlib index, and those
    candidates are rescored with ``crossencoder``.

    Args:
        query: Search text.
        index_name: Key into the module-level ``data`` dict.
        top_k: Number of rows to return. ``None`` or a value below 1
            yields an empty result instead of raising.
        top_k_multiplier: Over-fetch factor applied before reranking.

    Returns:
        A copy of the matching metadata rows with an added ``scores``
        column, sorted by score in descending order and cut to
        ``int(top_k)`` rows.
    """
    entry = data[index_name]
    index = entry["index"]
    meta = entry["meta"]

    # top_k originates from a free-form number field, so it can be missing
    if top_k is None:
        top_k = 0
    n_results = int(top_k)

    # never ask for more neighbours than the index holds; knn_query
    # fails when it cannot fill the requested k
    n_candidates = min(int(top_k * top_k_multiplier), index.get_current_count())

    if n_results < 1 or n_candidates < 1:
        empty = meta.iloc[:0].copy()
        empty["scores"] = []
        return empty

    # get nearest neighbor ids
    query_embedding = model.encode(query)
    ids, _ = index.knn_query(query_embedding, k=n_candidates)
    ids = ids[0]

    # rerank candidates
    results = meta.iloc[ids].copy()
    column_name = entry["column"]
    rerank_list = [(query, text) for text in results[column_name]]
    results["scores"] = crossencoder.predict(rerank_list)

    return results.sort_values("scores", ascending=False).head(n_results)
70
+
71
+ #functions for formatting hits
72
def format_hits(hits, index_name):
    """Render search hits as a Markdown result list.

    Args:
        hits: DataFrame with the columns ``title``, ``type_label``,
            ``url`` and ``id``; for ``"corpus"`` also the column named
            by ``data["corpus"]["column"]``.
        index_name: ``"title"`` or ``"corpus"``. Corpus entries carry
            the matched text below the heading.

    Returns:
        A Markdown string starting with ``## Resultater`` followed by
        one numbered ``###`` entry per hit.

    Raises:
        ValueError: If *index_name* is neither ``"title"`` nor ``"corpus"``.
    """
    if index_name not in ("title", "corpus"):
        raise ValueError(f"Unknown index name: {index_name!r}")

    body_column = data[index_name]["column"] if index_name == "corpus" else None

    entries = []
    # The running number is written directly into the heading. Running
    # str.format over the finished text would choke on any "{" or "}"
    # occurring in a title or text chunk.
    for number, row in enumerate(hits.to_dict("records"), start=1):
        entry = (
            f"### {number}. {row['title']} "
            f"([Link til {row['type_label']}]({row['url']}))"
            f"\nSag ID: {row['id']}"
        )
        if body_column is not None:
            entry += f"\n\n{row[body_column]}"
        entries.append(entry)

    merged = "\n\n".join(entries)
    return f"## Resultater\n{merged}"
84
+
85
+ #main entry function for search
86
def search(query, index_name, top_k):
    """Run a search and return the hits as Markdown.

    Delegates to ``get_hits`` and ``format_hits``. Once both have
    succeeded, the query is stored in ``state["query"]``, which
    ``update_description`` and ``analyse_doc`` read.
    """
    markdown = format_hits(get_hits(query, index_name, top_k), index_name)

    # recorded only after the search went through
    state["query"] = query

    return markdown
94
+
95
def update_description():
    """Return the status line naming the query stored in ``state``."""
    current_query = state["query"]
    return f"Nuværende søgning: {current_query}"
97
+
98
def analyse_doc(id):
    """Return an iframe showing one document with its paragraphs shaded by relevance.

    The row of ``meta_title`` whose ``id`` equals *id* is looked up. Each
    top-level ``<p>`` with more than 100 characters of text is scored
    against ``state["query"]`` by ``crossencoder``; the scores are
    min-max normalised and used as the alpha value of a light-blue
    background.

    Args:
        id: Case ID as typed by the user; surrounding whitespace is ignored.

    Returns:
        An HTML string: an ``<iframe>`` whose ``srcdoc`` holds the
        document, or a short "nothing found" message when the ID is
        unknown or no search has been made yet.
    """
    # local import: the stdlib module is not among the file-level imports
    import html as html_lib

    query = state["query"]
    case_id = str(id).strip()

    # Plain boolean mask instead of DataFrame.query: the ID is user input
    # and must not be spliced into an expression that pandas evaluates.
    meta_doc = meta_title[meta_title["id"] == case_id]

    if meta_doc.empty or query is None:
        return "<b>Ingen sager fundet...</b>"

    html_body = meta_doc["text_html"].iloc[0]
    html_title = meta_doc["title"].iloc[0]
    document = f"<html>\n<body>\n<h1>{html_title}</h1>\n{html_body}\n</body>\n</html>"

    soup = BeautifulSoup(document, 'lxml')

    min_characters = 100
    paragraphs = [
        p for p in soup.body.find_all('p', recursive=False)
        if len(p.get_text(strip=True)) > min_characters
    ]

    # Nothing to score when no paragraph is long enough; the document is
    # then shown without highlighting.
    if paragraphs:
        rerank_list = [(query, p.get_text(strip=True)) for p in paragraphs]
        rerank_scores = crossencoder.predict(rerank_list)

        lowest = rerank_scores.min()
        score_range = rerank_scores.max() - lowest
        if score_range > 0:
            opacities = (rerank_scores - lowest) / score_range
        else:
            # A single paragraph or identical scores: 0/0 would give NaN,
            # so no paragraph is highlighted.
            opacities = [0.0] * len(paragraphs)

        for element, opacity in zip(paragraphs, opacities):
            element['style'] = f'background-color:rgba(173, 216, 230, {opacity});'

    # Escape the markup for use inside the quoted attribute; an apostrophe
    # in the document would otherwise end srcdoc early.
    srcdoc = html_lib.escape(str(soup), quote=True)
    html_doc = f"""<iframe style="width: 100%; height: 480px" srcdoc='{srcdoc}'></iframe>"""

    return html_doc
123
+
124
+ #define interface
125
# Two-tab interface: "Søgning" runs the search, "Analyse" shows a single
# document with relevance highlighting.
with gr.Blocks() as demo:
    gr.Markdown("# DEMO Semantisk søgning i Miljø- og Fødevareklagenævnets afgørelser")

    with gr.Tab("Søgning"):
        gr.Markdown("## Søgning i afgørelser")
        gr.Markdown("Anvend søgefelt til at søge i titel eller tekst for afgørelser.")
        gr.Markdown("Brug spørgsmål, kort beskrivelser eller stikord til søgningen.")

        with gr.Row():
            textbox = gr.Textbox(
                placeholder="Skriv her...",
                lines=1,
                label="Søgning",
                scale=6,
            )
            name = gr.Radio(
                [("Titel", "title"), ("Tekst", "corpus")],
                value="corpus",
                label="Søgning i titel eller tekst?",
                scale=2,
            )
            num = gr.Number(5, label="Antal hits", scale=1)
            btn = gr.Button("Søg!", size="sm", scale=1)

        with gr.Row():
            output = gr.Markdown()

        # the button and the Enter key trigger the same search
        for search_trigger in (btn.click, textbox.submit):
            search_trigger(search, [textbox, name, num], output)

    with gr.Tab("Analyse"):
        gr.Markdown("## Analyse af hele dokumenter")
        description = gr.Markdown("Ingen søgning foretaget endnu - søg efter afgørelser ved at angive et sags ID.")
        gr.Markdown("Relevans for tekst angives ved farveintensitet - mere blå er mere relevant.")

        with gr.Row():
            id_textbox = gr.Textbox(
                placeholder="Indsæt sags ID her...",
                lines=1,
                label="Søgning",
                scale=8,
            )
            id_btn = gr.Button("Søg!", size="sm", scale=2)

        with gr.Row():
            html_output = gr.HTML()

        # refresh the "current search" line whenever the result list changes
        output.change(update_description, [], description)

        # the button and the Enter key trigger the same document analysis
        for analyse_trigger in (id_btn.click, id_textbox.submit):
            analyse_trigger(analyse_doc, id_textbox, html_output)

demo.launch()