Spaces:
Running
Running
Commit
·
6c2cadb
1
Parent(s):
b90a88d
delete old
Browse files
- .gitattributes +0 -36
- app.py +0 -190
- chroma/chroma_setup.py +0 -55
- requirements.txt +0 -2
.gitattributes
DELETED
@@ -1,36 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
DELETED
@@ -1,190 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import gradio as gr
|
3 |
-
import chromadb
|
4 |
-
from sentence_transformers import SentenceTransformer
|
5 |
-
import spaces
|
6 |
-
|
7 |
-
client = chromadb.PersistentClient(path="./chroma")
|
8 |
-
collection_de = client.get_collection(name="phil_de")
|
9 |
-
#collection_en = client.get_collection(name="phil_en")
|
10 |
-
authors_list_de = ["Epikur", "Ludwig Wittgenstein", "Sigmund Freud", "Marcus Aurelius", "Friedrich Nietzsche", "Epiktet", "Ernst Jünger", "Georg Christoph Lichtenberg", "Balthasar Gracian", "Hannah Arendt", "Erich Fromm", "Albert Camus"]
|
11 |
-
#authors_list_en = ["Friedrich Nietzsche", "Joscha Bach", "Hannah Arendt", "Albert Camus", "Mark Fisher"]
|
12 |
-
|
13 |
-
@spaces.GPU
def get_embeddings(queries, task):
    """Embed search queries using the instruction-style prompt format.

    Every query is turned into a prompt consisting of an ``Instruct:`` line
    carrying *task*, followed by a ``Query:`` line carrying the query itself,
    and the prompts are encoded in one batch.

    Args:
        queries: Iterable of query strings.
        task: Instruction text that is placed in front of every query.

    Returns:
        The result of ``SentenceTransformer.encode`` for the list of prompts.
    """
    # NOTE(review): the model is constructed on every call; presumably this
    # is intentional for the ``spaces.GPU`` execution model - confirm before
    # hoisting or caching it.
    encoder = SentenceTransformer(
        "Linq-AI-Research/Linq-Embed-Mistral",
        use_auth_token=os.getenv("HF_TOKEN"),
    )

    wrapped_prompts = []
    for text in queries:
        wrapped_prompts.append(f"Instruct: {task}\nQuery: {text}")

    return encoder.encode(wrapped_prompts)
|
19 |
-
|
20 |
-
def query_chroma(collection, embedding, authors, n_results=20):
    """Run a nearest-neighbour search and flatten the hits into dicts.

    Args:
        collection: Object exposing a Chroma-style ``query(**kwargs)`` method.
        embedding: Query vector; either an object with ``tolist()`` (for
            example a NumPy array) or any plain sequence of numbers.
        authors: Author names to restrict the search to. An empty or ``None``
            value disables the author filter.
        n_results: Maximum number of hits to request (defaults to 20, the
            value that was previously hard-coded).

    Returns:
        A list with one dict per hit, containing the keys ``id``, ``author``,
        ``book``, ``section``, ``title``, ``text`` and ``distance``. Missing
        metadata fields are reported as empty strings.
    """
    # "No filter" must be expressed as None: an empty dict is not a valid
    # Chroma filter expression (a filter needs exactly one operator).
    where = {"author": {"$in": authors}} if authors else None

    # Accept both array-like objects and plain sequences.
    vector = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)

    results = collection.query(
        query_embeddings=[vector],
        n_results=n_results,
        where=where,
        include=["documents", "metadatas", "distances"],
    )

    def _first_batch(key):
        # Results are nested per query embedding; only one embedding is sent,
        # so only the first inner list matters. A missing or None entry is
        # treated as "no hits".
        batches = results.get(key) or [[]]
        return batches[0]

    formatted_results = []
    for id_, metadata, document_text, distance in zip(
        _first_batch('ids'),
        _first_batch('metadatas'),
        _first_batch('documents'),
        _first_batch('distances'),
    ):
        metadata = metadata or {}  # a hit stored without metadata yields None
        formatted_results.append({
            "id": id_,
            "author": metadata.get('author', ''),
            "book": metadata.get('book', ''),
            "section": metadata.get('section', ''),
            "title": metadata.get('title', ''),
            "text": document_text,
            "distance": distance,
        })

    return formatted_results
|
47 |
-
|
48 |
-
|
49 |
-
theme = gr.themes.Soft(
|
50 |
-
primary_hue="indigo",
|
51 |
-
secondary_hue="slate",
|
52 |
-
neutral_hue="slate",
|
53 |
-
spacing_size="lg",
|
54 |
-
radius_size="lg",
|
55 |
-
text_size="lg",
|
56 |
-
font=["Helvetica", "sans-serif"],
|
57 |
-
font_mono=["Courier", "monospace"],
|
58 |
-
).set(
|
59 |
-
body_text_color="*neutral_800",
|
60 |
-
block_background_fill="*neutral_50",
|
61 |
-
block_border_width="0px",
|
62 |
-
button_primary_background_fill="*primary_600",
|
63 |
-
button_primary_background_fill_hover="*primary_700",
|
64 |
-
button_primary_text_color="white",
|
65 |
-
input_background_fill="white",
|
66 |
-
input_border_color="*neutral_200",
|
67 |
-
input_border_width="1px",
|
68 |
-
checkbox_background_color_selected="*primary_600",
|
69 |
-
checkbox_border_color_selected="*primary_600",
|
70 |
-
)
|
71 |
-
|
72 |
-
custom_css = """
|
73 |
-
/* Remove outer padding, margins, and borders */
|
74 |
-
gradio-app,
|
75 |
-
gradio-app > div,
|
76 |
-
gradio-app .gradio-container {
|
77 |
-
padding: 0 !important;
|
78 |
-
margin: 0 !important;
|
79 |
-
border: none !important;
|
80 |
-
}
|
81 |
-
|
82 |
-
/* Remove any potential outlines */
|
83 |
-
gradio-app:focus,
|
84 |
-
gradio-app > div:focus,
|
85 |
-
gradio-app .gradio-container:focus {
|
86 |
-
outline: none !important;
|
87 |
-
}
|
88 |
-
|
89 |
-
/* Ensure full width */
|
90 |
-
gradio-app {
|
91 |
-
width: 100% !important;
|
92 |
-
display: block !important;
|
93 |
-
}
|
94 |
-
|
95 |
-
.custom-markdown {
|
96 |
-
border: 1px solid var(--neutral-200);
|
97 |
-
padding: 10px;
|
98 |
-
border-radius: var(--radius-lg);
|
99 |
-
background-color: var(--color-background-primary);
|
100 |
-
margin-bottom: 15px;
|
101 |
-
}
|
102 |
-
.custom-markdown p {
|
103 |
-
margin-bottom: 10px;
|
104 |
-
line-height: 1.6;
|
105 |
-
}
|
106 |
-
|
107 |
-
@media (max-width: 768px) {
|
108 |
-
gradio-app,
|
109 |
-
gradio-app > div,
|
110 |
-
gradio-app .gradio-container {
|
111 |
-
padding-left: 1px !important;
|
112 |
-
padding-right: 1px !important;
|
113 |
-
}
|
114 |
-
.custom-markdown {
|
115 |
-
padding: 5px;
|
116 |
-
}
|
117 |
-
.accordion {
|
118 |
-
margin-left: -10px;
|
119 |
-
margin-right: -10px;
|
120 |
-
}
|
121 |
-
}
|
122 |
-
"""
|
123 |
-
|
124 |
-
with gr.Blocks(theme=theme, css=custom_css) as demo:
|
125 |
-
gr.Markdown("Geben Sie ein, wonach Sie suchen möchten (Query), trennen Sie mehrere Suchanfragen durch Semikola; filtern Sie nach Autoren (ohne Auswahl werden alle durchsucht) und klicken Sie auf **Suchen**, um zu suchen.")
|
126 |
-
#database_inp = gr.Dropdown(label="Database", choices=["German", "English"], value="German")
|
127 |
-
author_inp = gr.Dropdown(label="Autoren", choices=authors_list_de, multiselect=True)
|
128 |
-
inp = gr.Textbox(label="Query", lines=3, placeholder="Wie kann ich gesund leben?; Wie kann ich mich besser konzentrieren?; Was ist der Sinn des Lebens?; ...")
|
129 |
-
btn = gr.Button("Suchen")
|
130 |
-
loading_indicator = gr.Markdown(visible=False, elem_id="loading-indicator")
|
131 |
-
results = gr.State()
|
132 |
-
|
133 |
-
#def update_authors(database):
|
134 |
-
# return gr.update(choices=authors_list_de if database == "German" else authors_list_en)
|
135 |
-
|
136 |
-
#database_inp.change(
|
137 |
-
# fn=lambda database: update_authors(database),
|
138 |
-
# inputs=[database_inp],
|
139 |
-
# outputs=[author_inp]
|
140 |
-
#)
|
141 |
-
|
142 |
-
def perform_query(queries, authors):
    """Search the German collection for every query in a ';'-separated string.

    Args:
        queries: Raw text of the query box; individual queries are separated
            by semicolons. ``None`` is treated like an empty string.
        authors: Author names to filter by (empty or ``None`` searches all).

    Returns:
        A tuple ``(results_data, "")``. ``results_data`` is a list of
        ``(query, hits)`` pairs in input order, where ``hits`` is the list
        returned by ``query_chroma``. The empty string is the second output
        value of the click handler.
    """
    task = "Suche den zur Frage passenden Text"

    # Drop blank segments (empty box, trailing ';', ';;') so they are not
    # embedded and searched as if they were real queries.
    stripped = (part.strip() for part in (queries or "").split(';'))
    cleaned_queries = [part for part in stripped if part]
    if not cleaned_queries:
        # Nothing to search for: skip the embedding call entirely.
        return [], ""

    embeddings = get_embeddings(cleaned_queries, task)
    collection = collection_de
    results_data = [
        (query, query_chroma(collection, embedding, authors))
        for query, embedding in zip(cleaned_queries, embeddings)
    ]
    return results_data, ""
|
152 |
-
|
153 |
-
btn.click(
|
154 |
-
fn=lambda: ("", gr.update(visible=True)),
|
155 |
-
inputs=None,
|
156 |
-
outputs=[loading_indicator, loading_indicator],
|
157 |
-
queue=False
|
158 |
-
).then(
|
159 |
-
perform_query,
|
160 |
-
inputs=[inp, author_inp],
|
161 |
-
outputs=[results, loading_indicator]
|
162 |
-
)
|
163 |
-
|
164 |
-
@gr.render(inputs=[results])
def display_accordion(data):
    """Render one collapsed accordion per query with its hits as Markdown.

    Args:
        data: List of ``(query, hits)`` pairs as produced by
            ``perform_query``, or ``None``/empty when no search has been run.
            Each hit is a dict with the optional keys ``author``, ``book``,
            ``section``, ``title`` and ``text``.
    """
    # The backing state starts out as None, so there is nothing to iterate
    # until the first search has stored its results.
    if not data:
        return

    header_keys = ('author', 'book', 'section', 'title')
    for query, res in data:
        with gr.Accordion(query, open=False, elem_classes="accordion"):
            for result in res:
                with gr.Column():
                    # Build the header from the known fields, skipping empty
                    # values and the "Unknown" placeholder.
                    candidates = (str(result.get(key, '')) for key in header_keys)
                    header = ", ".join(
                        value for value in candidates
                        if value and value != "Unknown"
                    )
                    text = str(result.get('text', ''))

                    # Without any header parts, emit only the text instead of
                    # an empty bold marker.
                    if header:
                        markdown_contents = f"**{header}**\n\n{text}"
                    else:
                        markdown_contents = text
                    gr.Markdown(value=markdown_contents, elem_classes="custom-markdown")
|
189 |
-
|
190 |
-
demo.launch(inline=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chroma/chroma_setup.py
DELETED
@@ -1,55 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import json
|
3 |
-
import chromadb
|
4 |
-
|
5 |
-
# Initialize a persistent Chroma client
|
6 |
-
client = chromadb.PersistentClient(path="/home/johannes/Desktop/proj/Datenbank/chroma")
|
7 |
-
|
8 |
-
# Create or retrieve a collection for the books
|
9 |
-
collection = client.get_or_create_collection(name="phil_en", metadata={"hnsw:space": "cosine"})
|
10 |
-
|
11 |
-
# Function to safely get metadata, replacing None with "Unknown"
|
12 |
-
def get_metadata(entry, key):
    """Return ``entry[key]``, or ``"Unknown"`` if the key is absent or None.

    Args:
        entry: Mapping to read the value from.
        key: Name of the field to look up.

    Returns:
        The stored value (including falsy values such as ``""`` or ``0``),
        or the string ``"Unknown"`` when the value is missing or ``None``.
    """
    value = entry.get(key)  # single lookup instead of two
    return "Unknown" if value is None else value
|
14 |
-
|
15 |
-
# Directory containing the JSON files with pre-computed embeddings
|
16 |
-
json_dir = "/home/johannes/Desktop/proj/Datenbank/bücher/en/verarbeitet/ready_for_chroma/"
|
17 |
-
|
18 |
-
# Function to load JSON data from a file
|
19 |
-
def load_json_data(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path* and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
|
22 |
-
|
23 |
-
# Collect the full path of every *.json file directly inside json_dir.
json_files = [
    os.path.join(json_dir, file)
    for file in os.listdir(json_dir)
    if file.endswith('.json')
]

# Maps each metadata field stored in Chroma to the key used in the JSON
# entries (insertion order fixes the lookup order per entry).
metadata_fields = {
    'author': 'autor',
    'book': 'buch',
    'section': 'abschnitt',
    'title': 'titel',
}

# Import the files one by one. A failure in one file is reported and the
# remaining files are still processed.
for file_path in json_files:
    try:
        data = load_json_data(file_path)
        documents, embeddings, metadatas, ids = [], [], [], []

        # Each entry carries its text, a pre-computed embedding under the key
        # 'embedding', optional metadata, and an 'entry_number' used as ID.
        for entry in data:
            documents.append(entry['text'])
            embeddings.append(entry['embedding'])
            metadatas.append({
                field: get_metadata(entry, source_key)
                for field, source_key in metadata_fields.items()
            })
            entry_number = entry['entry_number']
            ids.append(f"{entry_number}")

        # Add the entries of this file in one batch, with their embeddings.
        collection.add(
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )
        print(f"Added {len(documents)} documents from {os.path.basename(file_path)}")
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
sentence_transformers
|
2 |
-
chromadb
|
|
|
|
|
|