HonestAnnie committed on
Commit
6c2cadb
·
1 Parent(s): b90a88d

delete old

Browse files
Files changed (4) hide show
  1. .gitattributes +0 -36
  2. app.py +0 -190
  3. chroma/chroma_setup.py +0 -55
  4. requirements.txt +0 -2
.gitattributes DELETED
@@ -1,36 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- *.sqlite3 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,190 +0,0 @@
1
- import os
2
- import gradio as gr
3
- import chromadb
4
- from sentence_transformers import SentenceTransformer
5
- import spaces
6
-
7
- client = chromadb.PersistentClient(path="./chroma")
8
- collection_de = client.get_collection(name="phil_de")
9
- #collection_en = client.get_collection(name="phil_en")
10
- authors_list_de = ["Epikur", "Ludwig Wittgenstein", "Sigmund Freud", "Marcus Aurelius", "Friedrich Nietzsche", "Epiktet", "Ernst Jünger", "Georg Christoph Lichtenberg", "Balthasar Gracian", "Hannah Arendt", "Erich Fromm", "Albert Camus"]
11
- #authors_list_en = ["Friedrich Nietzsche", "Joscha Bach", "Hannah Arendt", "Albert Camus", "Mark Fisher"]
12
-
13
# Lazily created embedding model, kept so repeated searches do not rebuild it.
_EMBEDDING_MODEL = None


@spaces.GPU
def get_embeddings(queries, task):
    """Embed search queries with the Linq-Embed-Mistral model.

    Each query is wrapped in the prompt ``"Instruct: {task}\\nQuery: {query}"``
    before encoding.

    Args:
        queries: Iterable of query strings.
        task: Instruction text placed in front of every query.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the prompts, one
        embedding per query and in the same order.
    """
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        # Constructing the model is by far the most expensive step, so do it
        # once instead of on every call.
        # NOTE(review): if spaces.GPU runs each call in a fresh worker process,
        # this cache will not persist between calls -- confirm on the target
        # hardware; in that case behaviour is the same as loading per call.
        _EMBEDDING_MODEL = SentenceTransformer(
            "Linq-AI-Research/Linq-Embed-Mistral",
            use_auth_token=os.getenv("HF_TOKEN"),
        )
    prompts = [f"Instruct: {task}\nQuery: {query}" for query in queries]
    return _EMBEDDING_MODEL.encode(prompts)
19
-
20
def query_chroma(collection, embedding, authors):
    """Look up the 20 nearest documents for a single query embedding.

    Args:
        collection: Object with a Chroma-style ``query`` method.
        embedding: Query vector; must provide ``tolist()``.
        authors: Author names to restrict the search to. A falsy value
            (``None`` or an empty list) searches all authors.

    Returns:
        A list of dicts with the keys ``id``, ``author``, ``book``,
        ``section``, ``title``, ``text`` and ``distance``, in the order the
        collection returned them. Missing metadata fields become ``''``.
    """
    # Send no filter at all when no author is selected; an empty dict is not
    # a valid filter expression.
    where = {"author": {"$in": authors}} if authors else None

    results = collection.query(
        query_embeddings=[embedding.tolist()],
        n_results=20,
        where=where,
        include=["documents", "metadatas", "distances"],
    )

    def first_batch(key):
        # Exactly one embedding is queried, so only the first batch is used.
        # A missing or None field is treated as "no rows".
        batches = results.get(key) or [[]]
        return batches[0] or []

    formatted_results = []
    rows = zip(
        first_batch('ids'),
        first_batch('metadatas'),
        first_batch('documents'),
        first_batch('distances'),
    )
    for id_, metadata, document_text, distance in rows:
        # An individual entry may carry no metadata at all.
        metadata = metadata or {}
        formatted_results.append({
            "id": id_,
            "author": metadata.get('author', ''),
            "book": metadata.get('book', ''),
            "section": metadata.get('section', ''),
            "title": metadata.get('title', ''),
            "text": document_text,
            "distance": distance,
        })

    return formatted_results
47
-
48
-
49
- theme = gr.themes.Soft(
50
- primary_hue="indigo",
51
- secondary_hue="slate",
52
- neutral_hue="slate",
53
- spacing_size="lg",
54
- radius_size="lg",
55
- text_size="lg",
56
- font=["Helvetica", "sans-serif"],
57
- font_mono=["Courier", "monospace"],
58
- ).set(
59
- body_text_color="*neutral_800",
60
- block_background_fill="*neutral_50",
61
- block_border_width="0px",
62
- button_primary_background_fill="*primary_600",
63
- button_primary_background_fill_hover="*primary_700",
64
- button_primary_text_color="white",
65
- input_background_fill="white",
66
- input_border_color="*neutral_200",
67
- input_border_width="1px",
68
- checkbox_background_color_selected="*primary_600",
69
- checkbox_border_color_selected="*primary_600",
70
- )
71
-
72
- custom_css = """
73
- /* Remove outer padding, margins, and borders */
74
- gradio-app,
75
- gradio-app > div,
76
- gradio-app .gradio-container {
77
- padding: 0 !important;
78
- margin: 0 !important;
79
- border: none !important;
80
- }
81
-
82
- /* Remove any potential outlines */
83
- gradio-app:focus,
84
- gradio-app > div:focus,
85
- gradio-app .gradio-container:focus {
86
- outline: none !important;
87
- }
88
-
89
- /* Ensure full width */
90
- gradio-app {
91
- width: 100% !important;
92
- display: block !important;
93
- }
94
-
95
- .custom-markdown {
96
- border: 1px solid var(--neutral-200);
97
- padding: 10px;
98
- border-radius: var(--radius-lg);
99
- background-color: var(--color-background-primary);
100
- margin-bottom: 15px;
101
- }
102
- .custom-markdown p {
103
- margin-bottom: 10px;
104
- line-height: 1.6;
105
- }
106
-
107
- @media (max-width: 768px) {
108
- gradio-app,
109
- gradio-app > div,
110
- gradio-app .gradio-container {
111
- padding-left: 1px !important;
112
- padding-right: 1px !important;
113
- }
114
- .custom-markdown {
115
- padding: 5px;
116
- }
117
- .accordion {
118
- margin-left: -10px;
119
- margin-right: -10px;
120
- }
121
- }
122
- """
123
-
124
def _split_queries(raw_queries):
    """Split a semicolon-separated string into stripped, non-empty queries."""
    return [part.strip() for part in (raw_queries or "").split(';') if part.strip()]


def _format_result_markdown(result):
    """Render one search hit as Markdown: bold source header, then the text."""
    header_parts = []
    for field in ('author', 'book', 'section', 'title'):
        value = str(result.get(field, ''))
        # Empty values and the literal "Unknown" are left out of the header.
        if value and value != "Unknown":
            header_parts.append(value)

    header = ", ".join(header_parts)
    text = str(result.get('text', ''))
    # Without any header part, show only the text instead of an empty "****".
    return f"**{header}**\n\n{text}" if header else text


with gr.Blocks(theme=theme, css=custom_css) as demo:
    gr.Markdown("Geben Sie ein, wonach Sie suchen möchten (Query), trennen Sie mehrere Suchanfragen durch Semikola; filtern Sie nach Autoren (ohne Auswahl werden alle durchsucht) und klicken Sie auf **Suchen**, um zu suchen.")
    #database_inp = gr.Dropdown(label="Database", choices=["German", "English"], value="German")
    author_inp = gr.Dropdown(label="Autoren", choices=authors_list_de, multiselect=True)
    inp = gr.Textbox(label="Query", lines=3, placeholder="Wie kann ich gesund leben?; Wie kann ich mich besser konzentrieren?; Was ist der Sinn des Lebens?; ...")
    btn = gr.Button("Suchen")
    loading_indicator = gr.Markdown(visible=False, elem_id="loading-indicator")
    # Holds the list of (query, hits) pairs produced by the last search.
    results = gr.State()

    #def update_authors(database):
    #    return gr.update(choices=authors_list_de if database == "German" else authors_list_en)

    #database_inp.change(
    #    fn=lambda database: update_authors(database),
    #    inputs=[database_inp],
    #    outputs=[author_inp]
    #)

    def perform_query(queries, authors):
        """Run one search per semicolon-separated query.

        Returns a pair: the list of ``(query, hits)`` tuples for the results
        State, and the new text for the loading indicator (always empty).
        """
        task = "Suche den zur Frage passenden Text"
        queries = _split_queries(queries)
        if not queries:
            # Nothing to search for: skip the embedding model entirely.
            return [], ""
        embeddings = get_embeddings(queries, task)
        collection = collection_de
        results_data = [
            (query, query_chroma(collection, embedding, authors))
            for query, embedding in zip(queries, embeddings)
        ]
        return results_data, ""

    # First make the loading indicator visible, then run the search.
    # NOTE(review): loading_indicator is listed twice as an output and its
    # text is set to "", so it never shows a message -- confirm the intent.
    btn.click(
        fn=lambda: ("", gr.update(visible=True)),
        inputs=None,
        outputs=[loading_indicator, loading_indicator],
        queue=False
    ).then(
        perform_query,
        inputs=[inp, author_inp],
        outputs=[results, loading_indicator]
    )

    @gr.render(inputs=[results])
    def display_accordion(data):
        """Render one collapsed accordion per query, listing its hits."""
        # The State holds no value until the first search has finished, so
        # there is nothing to iterate over yet.
        if not data:
            return
        for query, res in data:
            with gr.Accordion(query, open=False, elem_classes="accordion"):
                for result in res:
                    with gr.Column():
                        gr.Markdown(
                            value=_format_result_markdown(result),
                            elem_classes="custom-markdown",
                        )
189
-
190
- demo.launch(inline=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
chroma/chroma_setup.py DELETED
@@ -1,55 +0,0 @@
1
- import os
2
- import json
3
- import chromadb
4
-
5
- # Initialize a persistent Chroma client
6
- client = chromadb.PersistentClient(path="/home/johannes/Desktop/proj/Datenbank/chroma")
7
-
8
- # Create or retrieve a collection for the books
9
- collection = client.get_or_create_collection(name="phil_en", metadata={"hnsw:space": "cosine"})
10
-
11
def get_metadata(entry, key, default="Unknown"):
    """Return ``entry[key]``, falling back to *default* for missing or None values.

    Args:
        entry: Mapping holding one record's fields.
        key: Name of the field to read.
        default: Value returned when the field is absent or ``None``.

    Returns:
        The stored value, or *default*. Other falsy values such as ``""`` or
        ``0`` are returned unchanged.
    """
    value = entry.get(key)
    return default if value is None else value
14
-
15
- # Directory containing the JSON files with pre-computed embeddings
16
- json_dir = "/home/johannes/Desktop/proj/Datenbank/bücher/en/verarbeitet/ready_for_chroma/"
17
-
18
def load_json_data(file_path):
    """Read the UTF-8 encoded file at *file_path* and return its parsed JSON."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
22
-
23
- # Get all JSON files in the directory
24
- json_files = [os.path.join(json_dir, file) for file in os.listdir(json_dir) if file.endswith('.json')]
25
-
26
# Load every JSON file and add its entries, together with their pre-computed
# embeddings, to the collection.
for file_path in json_files:
    # Deliberately best-effort: a file that fails is reported and skipped so
    # that the remaining files are still imported.
    try:
        data = load_json_data(file_path)
        if not data:
            # Nothing to add; skip instead of sending an empty batch.
            print(f"Skipped {os.path.basename(file_path)}: no entries")
            continue

        documents = []
        embeddings = []
        metadatas = []
        ids = []

        # Collect text, embedding, metadata and ID of each entry in the file.
        for entry in data:
            documents.append(entry['text'])
            # Embeddings are expected under the key 'embedding'.
            embeddings.append(entry['embedding'])
            # The input uses German field names; they are stored under
            # English metadata keys.
            metadatas.append({
                'author': get_metadata(entry, 'autor'),
                'book': get_metadata(entry, 'buch'),
                'section': get_metadata(entry, 'abschnitt'),
                'title': get_metadata(entry, 'titel'),
            })
            # The entry number doubles as the document ID, so every entry
            # must have one.
            # NOTE(review): assumes entry numbers are unique across all
            # files -- confirm, otherwise IDs collide.
            ids.append(str(entry['entry_number']))

        # Add the entries to the collection with their pre-computed embeddings.
        collection.add(documents=documents, embeddings=embeddings, metadatas=metadatas, ids=ids)
        print(f"Added {len(documents)} documents from {os.path.basename(file_path)}")
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
54
-
55
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- sentence_transformers
2
- chromadb