forestav committed
Commit 0cea0f3 · 1 Parent(s): a6fce6a

update cron job

.github/workflows/actions.yml CHANGED
@@ -1,15 +1,15 @@
 name: Update Job Database
-
 on:
   schedule:
-    - cron: "0 0 * * *" # Runs once a day at midnight
+    - cron: "0 0 * * *" # Daily at midnight (main.py)
+    - cron: "0 0 * * 0" # Weekly on Sunday at midnight (training_pipeline.ipynb)
   workflow_dispatch: # Allows manual triggering

 permissions:
   contents: write

 jobs:
-  update-database:
+  daily-update:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
@@ -32,7 +32,7 @@ jobs:
         with:
           timezoneLinux: "Europe/Stockholm"

-      - name: Run update script
+      - name: Run daily update script
         env:
           PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
         run: python main.py
@@ -43,3 +43,40 @@ jobs:
           git config --local user.name "github-actions[bot]"
           git add timestamp2.txt
           git diff --quiet && git diff --staged --quiet || (git commit -m "Update timestamp" && git push)
+
+  weekly-training:
+    runs-on: ubuntu-latest
+    if: github.event.schedule == '0 0 * * 0' # Only run on weekly schedule
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install nbconvert jupyter
+
+      - name: Run training pipeline
+        env:
+          HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
+        run: |
+          jupyter nbconvert --to python training_pipeline.ipynb
+          python training_pipeline.py
+
+      - name: Run bootstrap script
+        run: python bootstrap.py
+
+      - name: Commit and push changes
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add .
+          git diff --quiet && git diff --staged --quiet || (git commit -m "Weekly training update" && git push)
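
For reference, the gating mechanism the new weekly-training job relies on: when a run is triggered by schedule, GitHub Actions sets github.event.schedule to the cron expression that fired, so a job-level if can restrict a job to one of the listed schedules, while jobs without such a condition run for every cron entry. A minimal illustrative sketch of that pattern (the job name and echo step are placeholders, not from this repo):

on:
  schedule:
    - cron: "0 0 * * *"   # nightly
    - cron: "0 0 * * 0"   # Sundays only

jobs:
  weekly-only:
    runs-on: ubuntu-latest
    if: github.event.schedule == '0 0 * * 0'   # skip runs started by the nightly cron
    steps:
      - run: echo "triggered by the Sunday schedule"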
bootstrap.py CHANGED
@@ -16,6 +16,7 @@ if __name__ == '__main__':
     """
     # Initialize Pinecone handler
     handler = PineconeHandler()
+    handler.recreate_index()
    log.info('Pinecone connection initialized')

    if PLACES or OCCUPATIONS:
feature_pipeline.ipynb DELETED
@@ -1,282 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import hopsworks\n",
-    "import os\n",
-    "import re\n",
-    "from dotenv import load_dotenv"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2025-01-08 19:51:38,754 INFO: Closing external client and cleaning up certificates.\n",
-      "Connection closed.\n",
-      "2025-01-08 19:51:38,758 INFO: Initializing external client\n",
-      "2025-01-08 19:51:38,758 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
-      "2025-01-08 19:51:39,828 INFO: Python Engine initialized.\n",
-      "\n",
-      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
-     ]
-    }
-   ],
-   "source": [
-    "load_dotenv()\n",
-    "\n",
-    "api_key = os.getenv(\"HOPSWORKS_API_KEY\")\n",
-    "project = hopsworks.login(project=\"orestavf\", api_key_value=api_key)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fs = project.get_feature_store()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Retrieve feature groups\n",
-    "feedback_fg = fs.get_feature_group(name=\"job_feedback\", version=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.93s) \n"
-     ]
-    }
-   ],
-   "source": [
-    "feedback_df = feedback_fg.read()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       " .dataframe tbody tr th:only-of-type {\n",
-       " vertical-align: middle;\n",
-       " }\n",
-       "\n",
-       " .dataframe tbody tr th {\n",
-       " vertical-align: top;\n",
-       " }\n",
-       "\n",
-       " .dataframe thead th {\n",
-       " text-align: right;\n",
-       " }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       " <thead>\n",
-       " <tr style=\"text-align: right;\">\n",
-       " <th></th>\n",
-       " <th>job_id</th>\n",
-       " <th>resume_text</th>\n",
-       " <th>job_headline</th>\n",
-       " <th>job_occupation</th>\n",
-       " <th>job_description</th>\n",
-       " <th>is_relevant</th>\n",
-       " </tr>\n",
-       " </thead>\n",
-       " <tbody>\n",
-       " <tr>\n",
-       " <th>0</th>\n",
-       " <td>29321628</td>\n",
-       " <td>Filip Orestav \\nTransformatorvägen 6, Sollent...</td>\n",
-       " <td>Junior Projektadmin till talangprogram på AFRY...</td>\n",
-       " <td>Projektledare, bygg och anläggning</td>\n",
-       " <td>Vill du kickstarta din karriär hos en av Sveri...</td>\n",
-       " <td>True</td>\n",
-       " </tr>\n",
-       " </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       " job_id resume_text \\\n",
-       "0 29321628 Filip Orestav \\nTransformatorvägen 6, Sollent... \n",
-       "\n",
-       " job_headline \\\n",
-       "0 Junior Projektadmin till talangprogram på AFRY... \n",
-       "\n",
-       " job_occupation \\\n",
-       "0 Projektledare, bygg och anläggning \n",
-       "\n",
-       " job_description is_relevant \n",
-       "0 Vill du kickstarta din karriär hos en av Sveri... True "
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "feedback_df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Columns to preprocess\n",
-    "columns_to_process = ['resume_text', 'job_headline', 'job_occupation', 'job_description']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define preprocessing functions\n",
-    "def preprocess_text(text):\n",
-    "    if isinstance(text, str):\n",
-    "        # Lowercase\n",
-    "        text = text.lower()\n",
-    "        # Remove special characters (preserving letters, numbers, and spaces)\n",
-    "        text = re.sub(r\"[^a-zåäöA-Z0-9\\s]\", \"\", text)\n",
-    "        # Remove extra spaces\n",
-    "        text = re.sub(r\"\\s+\", \" \", text)\n",
-    "        return text.strip() # Strip leading/trailing spaces\n",
-    "    return text"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2025-01-08 18:38:35,968 WARNING: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       " .dataframe tbody tr th:only-of-type {\n",
-       " vertical-align: middle;\n",
-       " }\n",
-       "\n",
-       " .dataframe tbody tr th {\n",
-       " vertical-align: top;\n",
-       " }\n",
-       "\n",
-       " .dataframe thead th {\n",
-       " text-align: right;\n",
-       " }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       " <thead>\n",
-       " <tr style=\"text-align: right;\">\n",
-       " <th></th>\n",
-       " <th>job_id</th>\n",
-       " <th>resume_text</th>\n",
-       " <th>job_headline</th>\n",
-       " <th>job_occupation</th>\n",
-       " <th>job_description</th>\n",
-       " <th>is_relevant</th>\n",
-       " </tr>\n",
-       " </thead>\n",
-       " <tbody>\n",
-       " <tr>\n",
-       " <th>0</th>\n",
-       " <td>29321628</td>\n",
-       " <td>filip orestav transformatorvägen 6 sollentuna ...</td>\n",
-       " <td>junior projektadmin till talangprogram på afry...</td>\n",
-       " <td>projektledare bygg och anläggning</td>\n",
-       " <td>vill du kickstarta din karriär hos en av sveri...</td>\n",
-       " <td>True</td>\n",
-       " </tr>\n",
-       " </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       " job_id resume_text \\\n",
-       "0 29321628 filip orestav transformatorvägen 6 sollentuna ... \n",
-       "\n",
-       " job_headline \\\n",
-       "0 junior projektadmin till talangprogram på afry... \n",
-       "\n",
-       " job_occupation \\\n",
-       "0 projektledare bygg och anläggning \n",
-       "\n",
-       " job_description is_relevant \n",
-       "0 vill du kickstarta din karriär hos en av sveri... True "
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Apply preprocessing\n",
-    "feedback_df[columns_to_process] = feedback_df[columns_to_process].applymap(preprocess_text)\n",
-    "\n",
-    "# Display processed dataframe\n",
-    "feedback_df.head()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
pinecone_handler.py CHANGED
@@ -199,6 +199,29 @@ class PineconeHandler:
             filter=metadata_filter if metadata_filter else None
         )
         return results.matches
+
+    def recreate_index(self) -> None:
+        """Recreate the Pinecone index"""
+        try:
+            self.pc.delete_index(PINECONE_INDEX_NAME)
+            log.info(f"Deleted index '{PINECONE_INDEX_NAME}'")
+
+            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
+            spec = ServerlessSpec(
+                cloud="aws",
+                region="us-east-1"
+            )
+
+            self.pc.create_index(
+                name=PINECONE_INDEX_NAME,
+                dimension=384,
+                metric="cosine",
+                spec=spec
+            )
+            self.index = self.pc.Index(PINECONE_INDEX_NAME)
+            log.info(f"Connected to new index '{PINECONE_INDEX_NAME}'")
+        except Exception as e:
+            log.error(f"Error deleting index: {str(e)}")

 def load_all(all_ads):
     handler = PineconeHandler()
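
For context, recreate_index() above follows a wipe-and-recreate pattern against Pinecone's serverless API. A minimal illustrative sketch of the same pattern using the standalone Pinecone v3+ client is below; the API key and index name are placeholders rather than values from this repo, while the dimension, metric, and serverless spec mirror the committed code:

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="YOUR_API_KEY")   # placeholder credential
index_name = "job-ads-example"          # placeholder index name

# Drop the old index if it exists, then create a fresh one with the same settings.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=384,                      # embedding dimension, as in the diff above
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

index = pc.Index(index_name)            # handle used for subsequent upserts and queries

Unlike this sketch, the committed method wraps both the delete and the create in a single try/except, so any failure (including deleting an index that does not yet exist) is logged rather than raised.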
timestamp2.txt CHANGED
@@ -1 +1 @@
-2025-01-08T02:29:02
+2025-01-08T21:34:31