TomData committed on
Commit
38886c5
·
1 Parent(s): 6d01321

Change repo structure to comply with the new Hugging Face Spaces storage (GB) limit

Browse files
Home.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  from src.chatbot import chatbot, keyword_search
 
3
  #from gradio_calendar import Calendar
4
  #from datetime import datetime
5
 
@@ -13,6 +14,11 @@ from src.chatbot import chatbot, keyword_search
13
  # login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
14
 
15
 
 
 
 
 
 
16
 
17
  # Define important variables
18
  legislature_periods = [
@@ -42,6 +48,7 @@ legislature_periods = [
42
  partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD', 'DA', 'FVP','DP','Z', 'PDS','Fraktionslos','not found', 'Gast']
43
 
44
 
 
45
 
46
  with gr.Blocks() as App:
47
  with gr.Tab("ChatBot"):
 
1
  import gradio as gr
2
  from src.chatbot import chatbot, keyword_search
3
+ from huggingface_hub import snapshot_download
4
  #from gradio_calendar import Calendar
5
  #from datetime import datetime
6
 
 
14
  # login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
15
 
16
 
17
+ # Retrieve Vectorstore
18
+ REPO_ID = "TomData/test"
19
+ LOCAL_DIR = "src/FAISS"
20
+ snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_DIR, repo_type="dataset")
21
+
22
 
23
  # Define important variables
24
  legislature_periods = [
 
48
  partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD', 'DA', 'FVP','DP','Z', 'PDS','Fraktionslos','not found', 'Gast']
49
 
50
 
51
+ # Define Gradio App Layout
52
 
53
  with gr.Blocks() as App:
54
  with gr.Tab("ChatBot"):
src/{FAISS.ipynb → FAISS/FAISS.ipynb} RENAMED
@@ -2,11 +2,13 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
  "import pandas as pd\n",
 
 
10
  "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
11
  "from langchain_community.document_loaders import DataFrameLoader\n",
12
  "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
@@ -19,7 +21,58 @@
19
  "cell_type": "markdown",
20
  "metadata": {},
21
  "source": [
22
- "### Load the whole speeches data"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ]
24
  },
25
  {
@@ -173,7 +226,7 @@
173
  }
174
  ],
175
  "source": [
176
- "df = pd.read_pickle(r\"C:\\Users\\Tom\\OneDrive\\Dokumente\\Lokal\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
177
  "df['date'] = pd.to_datetime(df['date'])\n"
178
  ]
179
  },
@@ -304,6 +357,13 @@
304
  "\n",
305
  " \n"
306
  ]
 
 
 
 
 
 
 
307
  }
308
  ],
309
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
  "import pandas as pd\n",
10
+ "import psycopg2\n",
11
+ "\n",
12
  "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
13
  "from langchain_community.document_loaders import DataFrameLoader\n",
14
  "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
 
21
  "cell_type": "markdown",
22
  "metadata": {},
23
  "source": [
24
+ "### Retrieve Speeches"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# db_connection -----------------------------------------------------------\n",
34
+ "con_details = {\n",
35
+ " \"host\" : \"localhost\",\n",
36
+ " \"database\" : \"next\",\n",
37
+ " \"user\" : \"postgres\",\n",
38
+ " \"password\" : \"postgres\",\n",
39
+ " \"port\" : \"5433\"\n",
40
+ "}\n",
41
+ "con = psycopg2.connect(**con_details)\n",
42
+ "\n",
43
+ "# get data tables ---------------------------------------------------------\n",
44
+ "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
45
+ " FROM open_discourse.speeches AS s\n",
46
+ " INNER JOIN open_discourse.factions AS f ON\n",
47
+ " s.faction_id = f.id;\"\"\", con)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "### Process speeches"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "print(set(df['party'].to_list()))"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "# Removing keys from interruptions of a speech\n",
73
+ "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) \n",
74
+ "df['date'] = pd.to_datetime(df['date'])\n",
75
+ "df"
76
  ]
77
  },
78
  {
 
226
  }
227
  ],
228
  "source": [
229
+ "# Convert to proper time format\n",
230
  "df['date'] = pd.to_datetime(df['date'])\n"
231
  ]
232
  },
 
357
  "\n",
358
  " \n"
359
  ]
360
+ },
361
+ {
362
+ "cell_type": "markdown",
363
+ "metadata": {},
364
+ "source": [
365
+ "This data has been uploaded to: https://huggingface.co/datasets/TomData/test"
366
+ ]
367
  }
368
  ],
369
  "metadata": {
src/Speeches/query.ipynb DELETED
@@ -1,267 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import psycopg2\n",
10
- "import pandas as pd"
11
- ]
12
- },
13
- {
14
- "cell_type": "markdown",
15
- "metadata": {},
16
- "source": [
17
- "### Pandas\n"
18
- ]
19
- },
20
- {
21
- "cell_type": "code",
22
- "execution_count": 13,
23
- "metadata": {},
24
- "outputs": [
25
- {
26
- "name": "stderr",
27
- "output_type": "stream",
28
- "text": [
29
- "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_12368\\2515868855.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
30
- " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
31
- ]
32
- }
33
- ],
34
- "source": [
35
- "# db_connection -----------------------------------------------------------\n",
36
- "con_details = {\n",
37
- " \"host\" : \"localhost\",\n",
38
- " \"database\" : \"next\",\n",
39
- " \"user\" : \"postgres\",\n",
40
- " \"password\" : \"postgres\",\n",
41
- " \"port\" : \"5433\"\n",
42
- "}\n",
43
- "con = psycopg2.connect(**con_details)\n",
44
- "\n",
45
- "# get data tables ---------------------------------------------------------\n",
46
- "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
47
- " FROM open_discourse.speeches AS s\n",
48
- " INNER JOIN open_discourse.factions AS f ON\n",
49
- " s.faction_id = f.id;\"\"\", con)\n",
50
- "\n",
51
- "\n"
52
- ]
53
- },
54
- {
55
- "cell_type": "markdown",
56
- "metadata": {},
57
- "source": [
58
- "### Data Cleaning"
59
- ]
60
- },
61
- {
62
- "cell_type": "code",
63
- "execution_count": 14,
64
- "metadata": {},
65
- "outputs": [
66
- {
67
- "name": "stdout",
68
- "output_type": "stream",
69
- "text": [
70
- "{'FVP', 'DA', 'FDP', 'BP', 'DP', 'DRP', 'PDS', 'SSW', 'Grüne', 'Fraktionslos', 'WAV', 'Gast', 'FU', 'KPD', 'DIE LINKE.', 'CDU/CSU', 'not found', 'GB/BHE', 'AfD', 'SPD', 'NR', 'Z'}\n"
71
- ]
72
- }
73
- ],
74
- "source": [
75
- "# Unique partys\n",
76
- "print(set(df['party'].to_list()))"
77
- ]
78
- },
79
- {
80
- "cell_type": "code",
81
- "execution_count": null,
82
- "metadata": {},
83
- "outputs": [
84
- {
85
- "data": {
86
- "text/html": [
87
- "<div>\n",
88
- "<style scoped>\n",
89
- " .dataframe tbody tr th:only-of-type {\n",
90
- " vertical-align: middle;\n",
91
- " }\n",
92
- "\n",
93
- " .dataframe tbody tr th {\n",
94
- " vertical-align: top;\n",
95
- " }\n",
96
- "\n",
97
- " .dataframe thead th {\n",
98
- " text-align: right;\n",
99
- " }\n",
100
- "</style>\n",
101
- "<table border=\"1\" class=\"dataframe\">\n",
102
- " <thead>\n",
103
- " <tr style=\"text-align: right;\">\n",
104
- " <th></th>\n",
105
- " <th>id</th>\n",
106
- " <th>speech_content</th>\n",
107
- " <th>date</th>\n",
108
- " <th>party</th>\n",
109
- " </tr>\n",
110
- " </thead>\n",
111
- " <tbody>\n",
112
- " <tr>\n",
113
- " <th>0</th>\n",
114
- " <td>0</td>\n",
115
- " <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
116
- " <td>1949-09-12</td>\n",
117
- " <td>not found</td>\n",
118
- " </tr>\n",
119
- " <tr>\n",
120
- " <th>1</th>\n",
121
- " <td>1</td>\n",
122
- " <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
123
- " <td>1949-09-12</td>\n",
124
- " <td>not found</td>\n",
125
- " </tr>\n",
126
- " <tr>\n",
127
- " <th>2</th>\n",
128
- " <td>2</td>\n",
129
- " <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
130
- " <td>1949-09-12</td>\n",
131
- " <td>not found</td>\n",
132
- " </tr>\n",
133
- " <tr>\n",
134
- " <th>3</th>\n",
135
- " <td>3</td>\n",
136
- " <td>Ja, ich habe den Wunsch.\\n</td>\n",
137
- " <td>1949-09-12</td>\n",
138
- " <td>not found</td>\n",
139
- " </tr>\n",
140
- " <tr>\n",
141
- " <th>4</th>\n",
142
- " <td>4</td>\n",
143
- " <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
144
- " <td>1949-09-12</td>\n",
145
- " <td>not found</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <th>...</th>\n",
149
- " <td>...</td>\n",
150
- " <td>...</td>\n",
151
- " <td>...</td>\n",
152
- " <td>...</td>\n",
153
- " </tr>\n",
154
- " <tr>\n",
155
- " <th>930955</th>\n",
156
- " <td>1084268</td>\n",
157
- " <td>\\n\\nWir sind zwar Kollegen.</td>\n",
158
- " <td>2022-12-16</td>\n",
159
- " <td>not found</td>\n",
160
- " </tr>\n",
161
- " <tr>\n",
162
- " <th>930956</th>\n",
163
- " <td>1084269</td>\n",
164
- " <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
165
- " <td>2022-12-16</td>\n",
166
- " <td>CDU/CSU</td>\n",
167
- " </tr>\n",
168
- " <tr>\n",
169
- " <th>930957</th>\n",
170
- " <td>1084270</td>\n",
171
- " <td>\\n\\nVielen Dank.</td>\n",
172
- " <td>2022-12-16</td>\n",
173
- " <td>not found</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <th>930958</th>\n",
177
- " <td>1084272</td>\n",
178
- " <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
179
- " <td>2022-12-16</td>\n",
180
- " <td>not found</td>\n",
181
- " </tr>\n",
182
- " <tr>\n",
183
- " <th>930959</th>\n",
184
- " <td>1084273</td>\n",
185
- " <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
186
- " <td>2022-12-16</td>\n",
187
- " <td>SPD</td>\n",
188
- " </tr>\n",
189
- " </tbody>\n",
190
- "</table>\n",
191
- "<p>930960 rows × 4 columns</p>\n",
192
- "</div>"
193
- ],
194
- "text/plain": [
195
- " id speech_content \\\n",
196
- "0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... \n",
197
- "1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n \n",
198
- "2 2 Ich danke für diese Erklärung. Ich stelle dami... \n",
199
- "3 3 Ja, ich habe den Wunsch.\\n \n",
200
- "4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... \n",
201
- "... ... ... \n",
202
- "930955 1084268 \\n\\nWir sind zwar Kollegen. \n",
203
- "930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! \n",
204
- "930957 1084270 \\n\\nVielen Dank. \n",
205
- "930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... \n",
206
- "930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... \n",
207
- "\n",
208
- " date party \n",
209
- "0 1949-09-12 not found \n",
210
- "1 1949-09-12 not found \n",
211
- "2 1949-09-12 not found \n",
212
- "3 1949-09-12 not found \n",
213
- "4 1949-09-12 not found \n",
214
- "... ... ... \n",
215
- "930955 2022-12-16 not found \n",
216
- "930956 2022-12-16 CDU/CSU \n",
217
- "930957 2022-12-16 not found \n",
218
- "930958 2022-12-16 not found \n",
219
- "930959 2022-12-16 SPD \n",
220
- "\n",
221
- "[930960 rows x 4 columns]"
222
- ]
223
- },
224
- "execution_count": 16,
225
- "metadata": {},
226
- "output_type": "execute_result"
227
- }
228
- ],
229
- "source": [
230
- "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
231
- "df['date'] = pd.to_datetime(df['date'])\n",
232
- "df"
233
- ]
234
- },
235
- {
236
- "cell_type": "code",
237
- "execution_count": null,
238
- "metadata": {},
239
- "outputs": [],
240
- "source": [
241
- "# Save to pickle\n",
242
- "df.to_pickle(\"speeches_1949_09_12\")"
243
- ]
244
- }
245
- ],
246
- "metadata": {
247
- "kernelspec": {
248
- "display_name": "Python 3",
249
- "language": "python",
250
- "name": "python3"
251
- },
252
- "language_info": {
253
- "codemirror_mode": {
254
- "name": "ipython",
255
- "version": 3
256
- },
257
- "file_extension": ".py",
258
- "mimetype": "text/x-python",
259
- "name": "python",
260
- "nbconvert_exporter": "python",
261
- "pygments_lexer": "ipython3",
262
- "version": "3.11.4"
263
- }
264
- },
265
- "nbformat": 4,
266
- "nbformat_minor": 2
267
- }