Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gzip
import html
import io
import os

import gradio as gr
import hnswlib
import pandas as pd
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, CrossEncoder
|
11 |
+
|
12 |
+
load_dotenv()

# Symmetric key for decrypting the bundled datasets. Indexing os.environ
# directly raises a clear KeyError when KEY is unset, instead of the
# confusing AttributeError that .get("KEY").encode(...) produced on None.
fernet = Fernet(os.environ["KEY"].encode("utf-8"))


def _load_encrypted_parquet(path):
    """Read a gzip-compressed, Fernet-encrypted parquet file into a DataFrame."""
    with gzip.open(path, "rb") as f:
        decrypted = fernet.decrypt(f.read())
    return pd.read_parquet(io.BytesIO(decrypted))


# Metadata tables: one row per decision title / per text chunk
meta_title = _load_encrypted_parquet("title_metadata.gz")
meta_corpus = _load_encrypted_parquet("corpus_metadata.gz")

# Load models (CPU-only deployment)
model = SentenceTransformer("KennethTM/MiniLM-L6-danish-encoder", device="cpu")
embedding_size = model.get_sentence_embedding_dimension()

crossencoder = CrossEncoder("KennethTM/MiniLM-L6-danish-reranker", device="cpu")


def _load_hnsw_index(path):
    """Load a prebuilt hnswlib cosine index sized to the encoder's dimension."""
    index = hnswlib.Index(space="cosine", dim=embedding_size)
    index.load_index(path)
    index.set_ef(40)  # query-time accuracy/speed trade-off
    return index


# Set up approximate nearest-neighbour indexes
title_index_path = "title.index"
title_index = _load_hnsw_index(title_index_path)

corpus_index_path = "corpus.index"
corpus_index = _load_hnsw_index(corpus_index_path)

# Dict bundling index, metadata table and searchable text column per collection
data = {"title": {"index": title_index, "meta": meta_title, "column": "title"},
        "corpus": {"index": corpus_index, "meta": meta_corpus, "column": "text_chunks"}}

# Mutable app state shared between the search tab and the analysis tab
state = {"query": None}
|
53 |
+
|
54 |
+
#function find most similar candidates
|
55 |
+
def get_hits(query, index_name, top_k, top_k_multiplier = 2):
    """Return the top_k most relevant rows for *query* from the given index.

    Over-fetches top_k * top_k_multiplier approximate nearest neighbours
    from the hnswlib index, then reranks the candidates with the
    cross-encoder and keeps the best top_k.
    """
    entry = data[index_name]

    # Approximate nearest-neighbour candidate retrieval
    embedding = model.encode(query)
    neighbour_ids, _ = entry["index"].knn_query(embedding, k = int(top_k * top_k_multiplier))

    # Rerank the candidate rows with the cross-encoder
    candidates = entry["meta"].iloc[neighbour_ids[0]].copy()
    pairs = [(query, text) for text in candidates[entry["column"]]]
    candidates["scores"] = crossencoder.predict(pairs)

    return candidates.sort_values("scores", ascending=False)[:int(top_k)]
|
70 |
+
|
71 |
+
#functions for formatting hits
|
72 |
+
def format_hits(hits, index_name):
    """Format reranked hits as a numbered markdown result list.

    Parameters:
        hits: DataFrame with at least 'title', 'type_label', 'url' and 'id'
              columns (plus the configured text column for the corpus index).
        index_name: "title" or "corpus"; corpus results also include the
                    matched text chunk.

    Returns a markdown string starting with a "## Resultater" heading.
    Raises ValueError for an unknown index name (the original fell through
    to a NameError on an unbound variable).
    """
    if index_name not in ("title", "corpus"):
        raise ValueError(f"Unknown index name: {index_name!r}")

    entries = []
    # Number results from 1. Build the strings directly with f-strings:
    # the original ran str.format over templates that already contained
    # document data, so any brace in a title/url/chunk crashed formatting.
    for rank, (_, row) in enumerate(hits.iterrows(), start=1):
        entry = (f"### {rank}. {row['title']} ([Link til {row['type_label']}]"
                 f"({row['url']}))\nSag ID: {row['id']}")
        if index_name == "corpus":
            entry += "\n\n" + row[data[index_name]["column"]]
        entries.append(entry)

    return "## Resultater\n" + "\n\n".join(entries)
|
84 |
+
|
85 |
+
#main entry function for search
|
86 |
+
def search(query, index_name, top_k):
    """Search callback: retrieve + rerank hits, format them, remember the query.

    The query is stored in the shared state dict so the analysis tab can
    reuse it for paragraph highlighting.
    """
    formatted = format_hits(get_hits(query, index_name, top_k), index_name)
    state["query"] = query
    return formatted
|
94 |
+
|
95 |
+
def update_description():
    """Return a status line showing which query the analysis tab refers to."""
    return "Nuværende søgning: {}".format(state["query"])
|
97 |
+
|
98 |
+
def analyse_doc(id):
    """Render one decision document as HTML with query-relevance highlighting.

    Looks up the document by case ID, scores its paragraphs against the
    last search query with the cross-encoder, and shades each paragraph
    blue proportionally to its normalized relevance.

    Parameters:
        id: case ID string entered by the user (name kept for the caller's
            sake even though it shadows the builtin).

    Returns an <iframe srcdoc=...> HTML snippet, or a "not found" message
    when the ID is unknown or no search has been made yet.
    """
    # Boolean-mask lookup instead of an f-string DataFrame.query: a quote
    # in the user-supplied ID would otherwise break (or inject into) the
    # query expression.
    meta_doc = meta_title[meta_title["id"] == id]

    if meta_doc.empty or state["query"] is None:
        return("<b>Ingen sager fundet...</b>")

    html_body = meta_doc["text_html"].iloc[0]
    html_title = meta_doc["title"].iloc[0]
    doc = f"<html>\n<body>\n<h1>{html_title}</h1>\n{html_body}\n</body>\n</html>"

    soup = BeautifulSoup(doc, 'lxml')

    # Only score paragraphs with a reasonable amount of text
    min_characters = 100
    p_list = [p for p in soup.body.find_all('p', recursive=False)
              if len(p.get_text(strip=True)) > min_characters]

    if p_list:
        rerank_list = [(state["query"], p.get_text(strip=True)) for p in p_list]
        rerank_scores = crossencoder.predict(rerank_list)

        # Min-max normalize to [0, 1]; guard the zero-range case (one
        # paragraph, or all equal scores) that previously produced NaN
        # alpha values in the CSS.
        score_span = rerank_scores.max() - rerank_scores.min()
        if score_span > 0:
            rerank_scores_norm = (rerank_scores - rerank_scores.min()) / score_span
        else:
            rerank_scores_norm = [1.0 for _ in p_list]

        for element, score in zip(p_list, rerank_scores_norm):
            element['style'] = f'background-color:rgba(173, 216, 230, {score});'

    # Escape the document before embedding it in the single-quoted srcdoc
    # attribute; an unescaped quote in the source HTML would otherwise
    # terminate the attribute and break the iframe.
    safe_doc = html.escape(str(soup), quote=True)
    html_doc = f"""<iframe style="width: 100%; height: 480px" srcdoc='{safe_doc}'></iframe>"""

    return(html_doc)
|
123 |
+
|
124 |
+
#define interface
|
125 |
+
with gr.Blocks() as demo:
    # App header
    gr.Markdown("# DEMO Semantisk søgning i Miljø- og Fødevareklagenævnets afgørelser")

    # Tab 1: semantic search over titles or full-text chunks
    with gr.Tab("Søgning"):
        gr.Markdown("## Søgning i afgørelser")
        gr.Markdown("Anvend søgefelt til at søge i titel eller tekst for afgørelser.")
        gr.Markdown("Brug spørgsmål, kort beskrivelser eller stikord til søgningen.")

        with gr.Row():
            query_box = gr.Textbox(placeholder="Skriv her...", lines=1, label="Søgning", scale=6)
            index_choice = gr.Radio([("Titel", "title"), ("Tekst", "corpus")], value="corpus", label="Søgning i titel eller tekst?", scale=2)
            hit_count = gr.Number(5, label="Antal hits", scale=1)
            search_button = gr.Button("Søg!", size="sm", scale=1)

        with gr.Row():
            results_md = gr.Markdown()

        # Both the button and pressing enter in the textbox trigger a search
        search_button.click(search, [query_box, index_choice, hit_count], results_md)
        query_box.submit(search, [query_box, index_choice, hit_count], results_md)

    # Tab 2: highlight query-relevant paragraphs in a single decision
    with gr.Tab("Analyse"):
        gr.Markdown("## Analyse af hele dokumenter")
        status_md = gr.Markdown("Ingen søgning foretaget endnu - søg efter afgørelser ved at angive et sags ID.")
        gr.Markdown("Relevans for tekst angives ved farveintensitet - mere blå er mere relevant.")

        with gr.Row():
            case_id_box = gr.Textbox(placeholder="Indsæt sags ID her...", lines=1, label="Søgning", scale=8)
            analyse_button = gr.Button("Søg!", size="sm", scale=2)

        with gr.Row():
            doc_html = gr.HTML()

        # Refresh the status line whenever new search results render
        results_md.change(update_description, [], status_md)
        analyse_button.click(analyse_doc, case_id_box, doc_html)
        case_id_box.submit(analyse_doc, case_id_box, doc_html)

demo.launch()
|