Spaces:

TomData
/

PoliticsToYou

Running

App Files Files Community

TomData commited on 29 days ago

Commit

38886c5

1 Parent(s): 6d01321

Change repo structure to adapt to HF new space GB limit

Browse files

Files changed (3) hide show

Home.py +7 -0
src/{FAISS.ipynb → FAISS/FAISS.ipynb} +63 -3
src/Speeches/query.ipynb +0 -267

Home.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 from src.chatbot import chatbot, keyword_search
 #from gradio_calendar import Calendar
 #from datetime import datetime
@@ -13,6 +14,11 @@ from src.chatbot import chatbot, keyword_search
 # login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
 # Define important variables
 legislature_periods = [
@@ -42,6 +48,7 @@ legislature_periods = [
 partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD', 'DA', 'FVP','DP','Z', 'PDS','Fraktionslos','not found', 'Gast']
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):

 import gradio as gr
 from src.chatbot import chatbot, keyword_search
+from huggingface_hub import snapshot_download
 #from gradio_calendar import Calendar
 #from datetime import datetime
 # login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
+# Retrieve Vectorstore
+REPO_ID = "TomData/test"
+LOCAL_DIR = "src/FAISS"
+snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_DIR, repo_type="dataset")
 # Define important variables
 legislature_periods = [
 partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD', 'DA', 'FVP','DP','Z', 'PDS','Fraktionslos','not found', 'Gast']
+# Define Gradio App Layout
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):

src/{FAISS.ipynb → FAISS/FAISS.ipynb} RENAMED Viewed

@@ -2,11 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
     "from langchain_community.document_loaders import DataFrameLoader\n",
     "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
@@ -19,7 +21,58 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Load the whole speeches data"
    ]
   },
   {
@@ -173,7 +226,7 @@
     }
    ],
    "source": [
-    "df = pd.read_pickle(r\"C:\\Users\\Tom\\OneDrive\\Dokumente\\Lokal\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
     "df['date'] = pd.to_datetime(df['date'])\n"
    ]
   },
@@ -304,6 +357,13 @@
     "\n",
     "    \n"
    ]
   }
  ],
  "metadata": {

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
+    "import psycopg2\n",
+    "\n",
     "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
     "from langchain_community.document_loaders import DataFrameLoader\n",
     "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "### Retrieve Speeches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# db_connection -----------------------------------------------------------\n",
+    "con_details = {\n",
+    "    \"host\"      : \"localhost\",\n",
+    "    \"database\"  : \"next\",\n",
+    "    \"user\"      : \"postgres\",\n",
+    "    \"password\"  : \"postgres\",\n",
+    "    \"port\"      : \"5433\"\n",
+    "}\n",
+    "con = psycopg2.connect(**con_details)\n",
+    "\n",
+    "# get data tables ---------------------------------------------------------\n",
+    "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
+    "                        FROM open_discourse.speeches AS s\n",
+    "                        INNER JOIN open_discourse.factions AS f ON\n",
+    "                        s.faction_id = f.id;\"\"\", con)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Process speeches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(set(df['party'].to_list()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Removing keys from interruptions of a speech\n",
+    "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) \n",
+    "df['date'] = pd.to_datetime(df['date'])\n",
+    "df"
    ]
   },
   {
     }
    ],
    "source": [
+    "# Convert to proper time format\n",
     "df['date'] = pd.to_datetime(df['date'])\n"
    ]
   },
     "\n",
     "    \n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This data has been uploaded to: https://huggingface.co/datasets/TomData/test"
+   ]
   }
  ],
  "metadata": {

src/Speeches/query.ipynb DELETED Viewed

@@ -1,267 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import psycopg2\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Pandas\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_12368\\2515868855.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
-      "  df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
-     ]
-    }
-   ],
-   "source": [
-    "# db_connection -----------------------------------------------------------\n",
-    "con_details = {\n",
-    "    \"host\"      : \"localhost\",\n",
-    "    \"database\"  : \"next\",\n",
-    "    \"user\"      : \"postgres\",\n",
-    "    \"password\"  : \"postgres\",\n",
-    "    \"port\"      : \"5433\"\n",
-    "}\n",
-    "con = psycopg2.connect(**con_details)\n",
-    "\n",
-    "# get data tables ---------------------------------------------------------\n",
-    "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
-    "                        FROM open_discourse.speeches AS s\n",
-    "                        INNER JOIN open_discourse.factions AS f ON\n",
-    "                        s.faction_id = f.id;\"\"\", con)\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Data Cleaning"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'FVP', 'DA', 'FDP', 'BP', 'DP', 'DRP', 'PDS', 'SSW', 'Grüne', 'Fraktionslos', 'WAV', 'Gast', 'FU', 'KPD', 'DIE LINKE.', 'CDU/CSU', 'not found', 'GB/BHE', 'AfD', 'SPD', 'NR', 'Z'}\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Unique partys\n",
-    "print(set(df['party'].to_list()))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>speech_content</th>\n",
-       "      <th>date</th>\n",
-       "      <th>party</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
-       "      <td>1949-09-12</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
-       "      <td>1949-09-12</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
-       "      <td>1949-09-12</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3</td>\n",
-       "      <td>Ja, ich habe den Wunsch.\\n</td>\n",
-       "      <td>1949-09-12</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4</td>\n",
-       "      <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
-       "      <td>1949-09-12</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>930955</th>\n",
-       "      <td>1084268</td>\n",
-       "      <td>\\n\\nWir sind zwar Kollegen.</td>\n",
-       "      <td>2022-12-16</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>930956</th>\n",
-       "      <td>1084269</td>\n",
-       "      <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
-       "      <td>2022-12-16</td>\n",
-       "      <td>CDU/CSU</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>930957</th>\n",
-       "      <td>1084270</td>\n",
-       "      <td>\\n\\nVielen Dank.</td>\n",
-       "      <td>2022-12-16</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>930958</th>\n",
-       "      <td>1084272</td>\n",
-       "      <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
-       "      <td>2022-12-16</td>\n",
-       "      <td>not found</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>930959</th>\n",
-       "      <td>1084273</td>\n",
-       "      <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
-       "      <td>2022-12-16</td>\n",
-       "      <td>SPD</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>930960 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             id                                     speech_content  \\\n",
-       "0             0  Meine Damen und Herren! Ich eröffne die 2. Sit...   \n",
-       "1             1    Der Bundesrat ist versammelt, Herr Präsident.\\n   \n",
-       "2             2  Ich danke für diese Erklärung. Ich stelle dami...   \n",
-       "3             3                         Ja, ich habe den Wunsch.\\n   \n",
-       "4             4  Ich erteile dem Herrn Bundespräsidenten das Wo...   \n",
-       "...         ...                                                ...   \n",
-       "930955  1084268                        \\n\\nWir sind zwar Kollegen.   \n",
-       "930956  1084269          \\n\\nLiebe, sehr geehrte Frau Präsidentin!   \n",
-       "930957  1084270                                   \\n\\nVielen Dank.   \n",
-       "930958  1084272  \\n\\nDen Abschluss dieser Aktuellen Stunde bild...   \n",
-       "930959  1084273  \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...   \n",
-       "\n",
-       "              date      party  \n",
-       "0       1949-09-12  not found  \n",
-       "1       1949-09-12  not found  \n",
-       "2       1949-09-12  not found  \n",
-       "3       1949-09-12  not found  \n",
-       "4       1949-09-12  not found  \n",
-       "...            ...        ...  \n",
-       "930955  2022-12-16  not found  \n",
-       "930956  2022-12-16    CDU/CSU  \n",
-       "930957  2022-12-16  not found  \n",
-       "930958  2022-12-16  not found  \n",
-       "930959  2022-12-16        SPD  \n",
-       "\n",
-       "[930960 rows x 4 columns]"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
-    "df['date'] = pd.to_datetime(df['date'])\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Dave to pickle\n",
-    "df.to_pickle(\"speeches_1949_09_12\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.4"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}