daqc committed
Commit aafb42f · verified · 1 Parent(s): 095939c

Delete scripts

scripts/cookies.py DELETED
@@ -1,715 +0,0 @@
1
- from requests.cookies import RequestsCookieJar
2
-
3
-
4
- COOKIES_LIST = [
5
- {
6
- "domain": ".youtube.com",
7
- "expirationDate": 1718884961,
8
- "hostOnly": False,
9
- "httpOnly": False,
10
- "name": "ST-xuwub9",
11
- "path": "/",
12
- "sameSite": None,
13
- "secure": False,
14
- "session": False,
15
- "storeId": None,
16
- "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
- },
18
- {
19
- "domain": ".youtube.com",
20
- "expirationDate": 1753004444.745411,
21
- "hostOnly": False,
22
- "httpOnly": True,
23
- "name": "__Secure-YEC",
24
- "path": "/",
25
- "sameSite": "lax",
26
- "secure": True,
27
- "session": False,
28
- "storeId": None,
29
- "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
- },
31
- {
32
- "domain": ".youtube.com",
33
- "expirationDate": 1753434620.050824,
34
- "hostOnly": False,
35
- "httpOnly": True,
36
- "name": "__Secure-3PSID",
37
- "path": "/",
38
- "sameSite": "no_restriction",
39
- "secure": True,
40
- "session": False,
41
- "storeId": None,
42
- "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
- },
44
- {
45
- "domain": ".youtube.com",
46
- "expirationDate": 1750420959.974642,
47
- "hostOnly": False,
48
- "httpOnly": False,
49
- "name": "SIDCC",
50
- "path": "/",
51
- "sameSite": None,
52
- "secure": False,
53
- "session": False,
54
- "storeId": None,
55
- "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
- },
57
- {
58
- "domain": ".youtube.com",
59
- "expirationDate": 1753434620.050652,
60
- "hostOnly": False,
61
- "httpOnly": False,
62
- "name": "SID",
63
- "path": "/",
64
- "sameSite": None,
65
- "secure": False,
66
- "session": False,
67
- "storeId": None,
68
- "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
- },
70
- {
71
- "domain": ".youtube.com",
72
- "expirationDate": 1750420958.397534,
73
- "hostOnly": False,
74
- "httpOnly": True,
75
- "name": "__Secure-1PSIDTS",
76
- "path": "/",
77
- "sameSite": None,
78
- "secure": True,
79
- "session": False,
80
- "storeId": None,
81
- "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
- },
83
- {
84
- "domain": ".youtube.com",
85
- "expirationDate": 1753433494.44729,
86
- "hostOnly": False,
87
- "httpOnly": False,
88
- "name": "_ga_M0180HEFCY",
89
- "path": "/",
90
- "sameSite": None,
91
- "secure": False,
92
- "session": False,
93
- "storeId": None,
94
- "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
- },
96
- {
97
- "domain": ".youtube.com",
98
- "expirationDate": 1753434620.050933,
99
- "hostOnly": False,
100
- "httpOnly": False,
101
- "name": "SAPISID",
102
- "path": "/",
103
- "sameSite": None,
104
- "secure": True,
105
- "session": False,
106
- "storeId": None,
107
- "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
- },
109
- {
110
- "domain": ".youtube.com",
111
- "expirationDate": 1750420959.974764,
112
- "hostOnly": False,
113
- "httpOnly": True,
114
- "name": "__Secure-1PSIDCC",
115
- "path": "/",
116
- "sameSite": None,
117
- "secure": True,
118
- "session": False,
119
- "storeId": None,
120
- "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
- },
122
- {
123
- "domain": ".youtube.com",
124
- "expirationDate": 1753434620.050881,
125
- "hostOnly": False,
126
- "httpOnly": True,
127
- "name": "SSID",
128
- "path": "/",
129
- "sameSite": None,
130
- "secure": True,
131
- "session": False,
132
- "storeId": None,
133
- "value": "AmlwXHnQvOQ10LVd-",
134
- },
135
- {
136
- "domain": ".youtube.com",
137
- "expirationDate": 1753434620.050959,
138
- "hostOnly": False,
139
- "httpOnly": False,
140
- "name": "__Secure-1PAPISID",
141
- "path": "/",
142
- "sameSite": None,
143
- "secure": True,
144
- "session": False,
145
- "storeId": None,
146
- "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
- },
148
- {
149
- "domain": ".youtube.com",
150
- "expirationDate": 1753434620.050795,
151
- "hostOnly": False,
152
- "httpOnly": True,
153
- "name": "__Secure-1PSID",
154
- "path": "/",
155
- "sameSite": None,
156
- "secure": True,
157
- "session": False,
158
- "storeId": None,
159
- "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
- },
161
- {
162
- "domain": ".youtube.com",
163
- "expirationDate": 1753434620.050993,
164
- "hostOnly": False,
165
- "httpOnly": False,
166
- "name": "__Secure-3PAPISID",
167
- "path": "/",
168
- "sameSite": "no_restriction",
169
- "secure": True,
170
- "session": False,
171
- "storeId": None,
172
- "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
- },
174
- {
175
- "domain": ".youtube.com",
176
- "expirationDate": 1750420959.974815,
177
- "hostOnly": False,
178
- "httpOnly": True,
179
- "name": "__Secure-3PSIDCC",
180
- "path": "/",
181
- "sameSite": "no_restriction",
182
- "secure": True,
183
- "session": False,
184
- "storeId": None,
185
- "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
- },
187
- {
188
- "domain": ".youtube.com",
189
- "expirationDate": 1750420958.397647,
190
- "hostOnly": False,
191
- "httpOnly": True,
192
- "name": "__Secure-3PSIDTS",
193
- "path": "/",
194
- "sameSite": "no_restriction",
195
- "secure": True,
196
- "session": False,
197
- "storeId": None,
198
- "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
- },
200
- {
201
- "domain": ".youtube.com",
202
- "expirationDate": 1753434620.050908,
203
- "hostOnly": False,
204
- "httpOnly": False,
205
- "name": "APISID",
206
- "path": "/",
207
- "sameSite": None,
208
- "secure": False,
209
- "session": False,
210
- "storeId": None,
211
- "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
- },
213
- {
214
- "domain": ".youtube.com",
215
- "expirationDate": 1753434620.050855,
216
- "hostOnly": False,
217
- "httpOnly": True,
218
- "name": "HSID",
219
- "path": "/",
220
- "sameSite": None,
221
- "secure": False,
222
- "session": False,
223
- "storeId": None,
224
- "value": "AasA7hmRuTFv7vjoq",
225
- },
226
- {
227
- "domain": ".youtube.com",
228
- "expirationDate": 1753435873.577793,
229
- "hostOnly": False,
230
- "httpOnly": True,
231
- "name": "LOGIN_INFO",
232
- "path": "/",
233
- "sameSite": "no_restriction",
234
- "secure": True,
235
- "session": False,
236
- "storeId": None,
237
- "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
- },
239
- {
240
- "domain": ".youtube.com",
241
- "expirationDate": 1753444956.555608,
242
- "hostOnly": False,
243
- "httpOnly": False,
244
- "name": "PREF",
245
- "path": "/",
246
- "sameSite": None,
247
- "secure": True,
248
- "session": False,
249
- "storeId": None,
250
- "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
- },
252
- ]
253
-
254
- COOKIES_LIST += [
255
- {
256
- "domain": ".www.researchgate.net",
257
- "hostOnly": False,
258
- "httpOnly": True,
259
- "name": "isInstIp",
260
- "path": "/",
261
- "sameSite": None,
262
- "secure": True,
263
- "session": True,
264
- "storeId": None,
265
- "value": "False",
266
- },
267
- {
268
- "domain": ".researchgate.net",
269
- "expirationDate": 1734423981,
270
- "hostOnly": False,
271
- "httpOnly": False,
272
- "name": "__eoi",
273
- "path": "/",
274
- "sameSite": None,
275
- "secure": False,
276
- "session": False,
277
- "storeId": None,
278
- "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
- },
280
- {
281
- "domain": ".www.researchgate.net",
282
- "expirationDate": 1753444909.646103,
283
- "hostOnly": False,
284
- "httpOnly": True,
285
- "name": "ptc",
286
- "path": "/",
287
- "sameSite": None,
288
- "secure": True,
289
- "session": False,
290
- "storeId": None,
291
- "value": "RG1.8947708639250500550.1718872043",
292
- },
293
- {
294
- "domain": ".researchgate.net",
295
- "expirationDate": 1750507578,
296
- "hostOnly": False,
297
- "httpOnly": False,
298
- "name": "euconsent-v2-didomi",
299
- "path": "/",
300
- "sameSite": "lax",
301
- "secure": True,
302
- "session": False,
303
- "storeId": None,
304
- "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
- },
306
- {
307
- "domain": ".researchgate.net",
308
- "expirationDate": 1718885236,
309
- "hostOnly": False,
310
- "httpOnly": False,
311
- "name": "_gat",
312
- "path": "/",
313
- "sameSite": None,
314
- "secure": False,
315
- "session": False,
316
- "storeId": None,
317
- "value": "1",
318
- },
319
- {
320
- "domain": "www.researchgate.net",
321
- "expirationDate": 1721477183,
322
- "hostOnly": True,
323
- "httpOnly": False,
324
- "name": "_pbjs_userid_consent_data",
325
- "path": "/",
326
- "sameSite": "lax",
327
- "secure": False,
328
- "session": False,
329
- "storeId": None,
330
- "value": "3524755945110770",
331
- },
332
- {
333
- "domain": ".researchgate.net",
334
- "expirationDate": 1752567981,
335
- "hostOnly": False,
336
- "httpOnly": False,
337
- "name": "__gads",
338
- "path": "/",
339
- "sameSite": None,
340
- "secure": False,
341
- "session": False,
342
- "storeId": None,
343
- "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
- },
345
- {
346
- "domain": ".researchgate.net",
347
- "expirationDate": 1718886709.646173,
348
- "hostOnly": False,
349
- "httpOnly": True,
350
- "name": "__cf_bm",
351
- "path": "/",
352
- "sameSite": "no_restriction",
353
- "secure": True,
354
- "session": False,
355
- "storeId": None,
356
- "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
- },
358
- {
359
- "domain": ".researchgate.net",
360
- "expirationDate": 1752567981,
361
- "hostOnly": False,
362
- "httpOnly": False,
363
- "name": "__gpi",
364
- "path": "/",
365
- "sameSite": None,
366
- "secure": False,
367
- "session": False,
368
- "storeId": None,
369
- "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
- },
371
- {
372
- "domain": ".researchgate.net",
373
- "hostOnly": False,
374
- "httpOnly": True,
375
- "name": "_cfuvid",
376
- "path": "/",
377
- "sameSite": "no_restriction",
378
- "secure": True,
379
- "session": True,
380
- "storeId": None,
381
- "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
- },
383
- {
384
- "domain": ".researchgate.net",
385
- "expirationDate": 1753445177.271667,
386
- "hostOnly": False,
387
- "httpOnly": False,
388
- "name": "_ga",
389
- "path": "/",
390
- "sameSite": None,
391
- "secure": False,
392
- "session": False,
393
- "storeId": None,
394
- "value": "GA1.1.1525244793.1718885177",
395
- },
396
- {
397
- "domain": ".researchgate.net",
398
- "expirationDate": 1753445177.271482,
399
- "hostOnly": False,
400
- "httpOnly": False,
401
- "name": "_ga_4P31SJ70EJ",
402
- "path": "/",
403
- "sameSite": None,
404
- "secure": False,
405
- "session": False,
406
- "storeId": None,
407
- "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
- },
409
- {
410
- "domain": ".researchgate.net",
411
- "expirationDate": 1718971576,
412
- "hostOnly": False,
413
- "httpOnly": False,
414
- "name": "_gid",
415
- "path": "/",
416
- "sameSite": None,
417
- "secure": False,
418
- "session": False,
419
- "storeId": None,
420
- "value": "GA1.2.854907463.1718885177",
421
- },
422
- {
423
- "domain": ".www.researchgate.net",
424
- "expirationDate": 1750407982.506505,
425
- "hostOnly": False,
426
- "httpOnly": True,
427
- "name": "did",
428
- "path": "/",
429
- "sameSite": None,
430
- "secure": True,
431
- "session": False,
432
- "storeId": None,
433
- "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
- },
435
- {
436
- "domain": ".researchgate.net",
437
- "expirationDate": 1750507578,
438
- "hostOnly": False,
439
- "httpOnly": False,
440
- "name": "didomi_token",
441
- "path": "/",
442
- "sameSite": "lax",
443
- "secure": True,
444
- "session": False,
445
- "storeId": None,
446
- "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
- },
448
- {
449
- "domain": ".www.researchgate.net",
450
- "hostOnly": False,
451
- "httpOnly": True,
452
- "name": "hasPdpNext",
453
- "path": "/",
454
- "sameSite": None,
455
- "secure": True,
456
- "session": True,
457
- "storeId": None,
458
- "value": "False",
459
- },
460
- {
461
- "domain": ".researchgate.net",
462
- "expirationDate": 1750421183,
463
- "hostOnly": False,
464
- "httpOnly": False,
465
- "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
- "path": "/",
467
- "sameSite": "lax",
468
- "secure": True,
469
- "session": False,
470
- "storeId": None,
471
- "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
- },
473
- {
474
- "domain": ".www.researchgate.net",
475
- "hostOnly": False,
476
- "httpOnly": True,
477
- "name": "sid",
478
- "path": "/",
479
- "sameSite": None,
480
- "secure": True,
481
- "session": True,
482
- "storeId": None,
483
- "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
- },
485
- ]
486
-
487
- COOKIES_LIST += [
488
- {
489
- "domain": "github.com",
490
- "hostOnly": True,
491
- "httpOnly": True,
492
- "name": "_gh_sess",
493
- "path": "/",
494
- "sameSite": "lax",
495
- "secure": True,
496
- "session": True,
497
- "storeId": None,
498
- "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
- },
500
- {
501
- "domain": ".github.com",
502
- "expirationDate": 1750408875.763785,
503
- "hostOnly": False,
504
- "httpOnly": False,
505
- "name": "_octo",
506
- "path": "/",
507
- "sameSite": "lax",
508
- "secure": True,
509
- "session": False,
510
- "storeId": None,
511
- "value": "GH1.1.728652011.1718872875",
512
- },
513
- {
514
- "domain": ".github.com",
515
- "expirationDate": 1750408875.763926,
516
- "hostOnly": False,
517
- "httpOnly": True,
518
- "name": "logged_in",
519
- "path": "/",
520
- "sameSite": "lax",
521
- "secure": True,
522
- "session": False,
523
- "storeId": None,
524
- "value": "no",
525
- },
526
- {
527
- "domain": ".github.com",
528
- "hostOnly": False,
529
- "httpOnly": False,
530
- "name": "preferred_color_mode",
531
- "path": "/",
532
- "sameSite": "lax",
533
- "secure": True,
534
- "session": True,
535
- "storeId": None,
536
- "value": "dark",
537
- },
538
- {
539
- "domain": ".github.com",
540
- "hostOnly": False,
541
- "httpOnly": False,
542
- "name": "tz",
543
- "path": "/",
544
- "sameSite": "lax",
545
- "secure": True,
546
- "session": True,
547
- "storeId": None,
548
- "value": "Europe%2FParis",
549
- },
550
- ]
551
-
552
- COOKIES_LIST += [
553
- {
554
- "domain": ".web.archive.org",
555
- "expirationDate": 1718886430,
556
- "hostOnly": False,
557
- "httpOnly": False,
558
- "name": "_gat",
559
- "path": "/web/20201123221659/http://orcid.org/",
560
- "sameSite": None,
561
- "secure": False,
562
- "session": False,
563
- "storeId": None,
564
- "value": "1",
565
- },
566
- {
567
- "domain": ".web.archive.org",
568
- "expirationDate": 1718972770,
569
- "hostOnly": False,
570
- "httpOnly": False,
571
- "name": "_gid",
572
- "path": "/web/20201123221659/http://orcid.org/",
573
- "sameSite": None,
574
- "secure": False,
575
- "session": False,
576
- "storeId": None,
577
- "value": "GA1.2.402246368.1606169825",
578
- },
579
- {
580
- "domain": ".web.archive.org",
581
- "expirationDate": 1753446370.315621,
582
- "hostOnly": False,
583
- "httpOnly": False,
584
- "name": "_ga",
585
- "path": "/web/20201123221659/http://orcid.org/",
586
- "sameSite": None,
587
- "secure": False,
588
- "session": False,
589
- "storeId": None,
590
- "value": "GA1.2.1301409987.1606169825",
591
- },
592
- {
593
- "domain": ".web.archive.org",
594
- "expirationDate": 1750422367,
595
- "hostOnly": False,
596
- "httpOnly": False,
597
- "name": "_hjid",
598
- "path": "/web/20201123221659/http://orcid.org/",
599
- "sameSite": "lax",
600
- "secure": False,
601
- "session": False,
602
- "storeId": None,
603
- "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
- },
605
- {
606
- "domain": ".web.archive.org",
607
- "expirationDate": 1718888167,
608
- "hostOnly": False,
609
- "httpOnly": False,
610
- "name": "_hjFirstSeen",
611
- "path": "/web/20201123221659/http://orcid.org/",
612
- "sameSite": "lax",
613
- "secure": False,
614
- "session": False,
615
- "storeId": None,
616
- "value": "1",
617
- },
618
- ]
619
- COOKIES_LIST += [
620
- {
621
- "domain": "orcid.org",
622
- "hostOnly": True,
623
- "httpOnly": False,
624
- "name": "AWSELBCORS",
625
- "path": "/",
626
- "sameSite": "no_restriction",
627
- "secure": True,
628
- "session": True,
629
- "storeId": None,
630
- "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
- },
632
- {
633
- "domain": ".orcid.org",
634
- "expirationDate": 1753452454.637671,
635
- "hostOnly": False,
636
- "httpOnly": False,
637
- "name": "_ga_9R61FWK9H5",
638
- "path": "/",
639
- "sameSite": None,
640
- "secure": False,
641
- "session": False,
642
- "storeId": None,
643
- "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
- },
645
- {
646
- "domain": ".orcid.org",
647
- "expirationDate": 1753452454.63421,
648
- "hostOnly": False,
649
- "httpOnly": False,
650
- "name": "_ga",
651
- "path": "/",
652
- "sameSite": None,
653
- "secure": False,
654
- "session": False,
655
- "storeId": None,
656
- "value": "GA1.1.2021310691.1718892455",
657
- },
658
- {
659
- "domain": "orcid.org",
660
- "hostOnly": True,
661
- "httpOnly": False,
662
- "name": "AWSELB",
663
- "path": "/",
664
- "sameSite": None,
665
- "secure": False,
666
- "session": True,
667
- "storeId": None,
668
- "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
- },
670
- {
671
- "domain": ".orcid.org",
672
- "expirationDate": 1750428454,
673
- "hostOnly": False,
674
- "httpOnly": False,
675
- "name": "OptanonAlertBoxClosed",
676
- "path": "/",
677
- "sameSite": "lax",
678
- "secure": False,
679
- "session": False,
680
- "storeId": None,
681
- "value": "2024-06-20T14:07:34.583Z",
682
- },
683
- {
684
- "domain": ".orcid.org",
685
- "expirationDate": 1750428454,
686
- "hostOnly": False,
687
- "httpOnly": False,
688
- "name": "OptanonConsent",
689
- "path": "/",
690
- "sameSite": "lax",
691
- "secure": False,
692
- "session": False,
693
- "storeId": None,
694
- "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
- },
696
- {
697
- "domain": "orcid.org",
698
- "hostOnly": True,
699
- "httpOnly": False,
700
- "name": "XSRF-TOKEN",
701
- "path": "/",
702
- "sameSite": None,
703
- "secure": True,
704
- "session": True,
705
- "storeId": None,
706
- "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
- },
708
- ]
709
-
710
- # Create a RequestsCookieJar instance
711
- COOKIES = RequestsCookieJar()
712
-
713
- # Add cookies to the jar
714
- for cookie in COOKIES_LIST:
715
- COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
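For reference, a jar assembled this way can be attached to a requests session so that matching cookies are sent automatically with each request. A minimal sketch, assuming the module were importable from its former scripts/ location (that import path is an assumption):

import requests

from scripts.cookies import COOKIES  # hypothetical import path for this deleted module

session = requests.Session()
session.cookies = COOKIES  # reuse the pre-built RequestsCookieJar for every request in this session
response = session.get("https://www.youtube.com/")  # cookies whose domain matches .youtube.com are sent automatically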
scripts/gaia_scorer.py DELETED
@@ -1,124 +0,0 @@
1
- import re
2
- import string
3
- import warnings
4
-
5
-
6
- def normalize_number_str(number_str: str) -> float:
7
- # we replace these common units and commas to allow
8
- # conversion to float
9
- for char in ["$", "%", ","]:
10
- number_str = number_str.replace(char, "")
11
- try:
12
- return float(number_str)
13
- except ValueError:
14
- print(f"String {number_str} cannot be normalized to a number.")
15
- return float("inf")
16
-
17
-
18
- def split_string(
19
- s: str,
20
- char_list: list[str] = [",", ";"],
21
- ) -> list[str]:
22
- pattern = f"[{''.join(char_list)}]"
23
- return re.split(pattern, s)
24
-
25
-
26
- def is_float(element: any) -> bool:
27
- try:
28
- float(element)
29
- return True
30
- except ValueError:
31
- return False
32
-
33
-
34
- def question_scorer(
35
- model_answer: str,
36
- ground_truth: str,
37
- ) -> bool:
38
- # if gt is a number
39
- if is_float(ground_truth):
40
- normalized_answer = normalize_number_str(str(model_answer))
41
- return normalized_answer == float(ground_truth)
42
-
43
- # if gt is a list
44
- elif any(char in ground_truth for char in [",", ";"]):
45
- # question with the fish: normalization removes punct
46
-
47
- gt_elems = split_string(ground_truth)
48
- ma_elems = split_string(model_answer)
49
-
50
- # check length is the same
51
- if len(gt_elems) != len(ma_elems):
52
- warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
53
- return False
54
-
55
- # compare each element as float or str
56
- comparisons = []
57
- for ma_elem, gt_elem in zip(ma_elems, gt_elems):
58
- if is_float(gt_elem):
59
- normalized_ma_elem = normalize_number_str(ma_elem)
60
- comparisons.append(normalized_ma_elem == float(gt_elem))
61
- else:
62
- # we do not remove punct since comparisons can include punct
63
- comparisons.append(
64
- normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
65
- )
66
- return all(comparisons)
67
-
68
- # if gt is a str
69
- else:
70
- return normalize_str(model_answer) == normalize_str(ground_truth)
71
-
72
-
73
- def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
74
- prediction = prediction.lower()
75
- true_answer = true_answer.lower()
76
- if len(prediction) > len(true_answer) * 3:
77
- return False
78
- i = 0
79
- for letter in true_answer:
80
- if letter in prediction[i:]:
81
- i += prediction[i:].index(letter)
82
- else:
83
- return False
84
- return True
85
-
86
-
87
- def check_close_call(prediction, true_answer, is_correct):
88
- if is_correct:
89
- return True
90
- else:
91
- if is_float(true_answer):
92
- return is_correct
93
- else:
94
- if (
95
- check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
96
- and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
97
- ):
98
- print(f"Close call: {prediction} vs {true_answer}")
99
- return True
100
- else:
101
- return False
102
-
103
-
104
- def normalize_str(input_str, remove_punct=True) -> str:
105
- """
106
- Normalize a string by:
107
- - Removing all white spaces
108
- - Optionally removing punctuation (if remove_punct is True)
109
- - Converting to lowercase
110
- Parameters:
111
- - input_str: str, the string to normalize
112
- - remove_punct: bool, whether to remove punctuation (default: True)
113
- Returns:
114
- - str, the normalized string
115
- """
116
- # Remove all white spaces. Required e.g for seagull vs. sea gull
117
- no_spaces = re.sub(r"\s", "", input_str)
118
-
119
- # Remove punctuation, if specified.
120
- if remove_punct:
121
- translator = str.maketrans("", "", string.punctuation)
122
- return no_spaces.lower().translate(translator)
123
- else:
124
- return no_spaces.lower()
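For context, question_scorer treats the ground truth as a number, a comma/semicolon-separated list, or a plain string, and normalize_str drops whitespace (and optionally punctuation) before comparing. A minimal sketch of the resulting behavior, assuming the module's former scripts/ location; the example answers are illustrative, not GAIA data:

from scripts.gaia_scorer import question_scorer  # hypothetical import path for this deleted module

print(question_scorer("1,234.5", "1234.5"))         # True: "$", "%" and "," are stripped before the float comparison
print(question_scorer("sea gull; 3", "seagull;3"))  # True: elements are split on ",;" and compared one by one
print(question_scorer("owl", "seagull"))            # False: the normalized strings differ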
scripts/mdconvert.py DELETED
@@ -1,949 +0,0 @@
1
- # This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
2
- # Thanks to Microsoft researchers for open-sourcing this!
3
- # type: ignore
4
- import base64
5
- import copy
6
- import html
7
- import json
8
- import mimetypes
9
- import os
10
- import re
11
- import shutil
12
- import subprocess
13
- import sys
14
- import tempfile
15
- import traceback
16
- from typing import Any, Dict, List, Optional, Union
17
- from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
18
-
19
- import mammoth
20
- import markdownify
21
- import pandas as pd
22
- import pdfminer
23
- import pdfminer.high_level
24
- import pptx
25
-
26
- # File-format detection
27
- import puremagic
28
- import pydub
29
- import requests
30
- import speech_recognition as sr
31
- from bs4 import BeautifulSoup
32
- from youtube_transcript_api import YouTubeTranscriptApi
33
- from youtube_transcript_api.formatters import SRTFormatter
34
-
35
-
36
- class _CustomMarkdownify(markdownify.MarkdownConverter):
37
- """
38
- A custom version of markdownify's MarkdownConverter. Changes include:
39
-
40
- - Altering the default heading style to use '#', '##', etc.
41
- - Removing javascript hyperlinks.
42
- - Truncating images with large data:uri sources.
43
- - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
44
- """
45
-
46
- def __init__(self, **options: Any):
47
- options["heading_style"] = options.get("heading_style", markdownify.ATX)
48
- # Explicitly cast options to the expected type if necessary
49
- super().__init__(**options)
50
-
51
- def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
52
- """Same as usual, but be sure to start with a new line"""
53
- if not convert_as_inline:
54
- if not re.search(r"^\n", text):
55
- return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
56
-
57
- return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
58
-
59
- def convert_a(self, el: Any, text: str, convert_as_inline: bool):
60
- """Same as usual converter, but removes Javascript links and escapes URIs."""
61
- prefix, suffix, text = markdownify.chomp(text) # type: ignore
62
- if not text:
63
- return ""
64
- href = el.get("href")
65
- title = el.get("title")
66
-
67
- # Escape URIs and skip non-http or file schemes
68
- if href:
69
- try:
70
- parsed_url = urlparse(href) # type: ignore
71
- if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
72
- return "%s%s%s" % (prefix, text, suffix)
73
- href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
74
- except ValueError: # It's not clear if this ever gets thrown
75
- return "%s%s%s" % (prefix, text, suffix)
76
-
77
- # For the replacement see #29: text nodes underscores are escaped
78
- if (
79
- self.options["autolinks"]
80
- and text.replace(r"\_", "_") == href
81
- and not title
82
- and not self.options["default_title"]
83
- ):
84
- # Shortcut syntax
85
- return "<%s>" % href
86
- if self.options["default_title"] and not title:
87
- title = href
88
- title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
89
- return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
90
-
91
- def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
92
- """Same as usual converter, but removes data URIs"""
93
-
94
- alt = el.attrs.get("alt", None) or ""
95
- src = el.attrs.get("src", None) or ""
96
- title = el.attrs.get("title", None) or ""
97
- title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
98
- if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
99
- return alt
100
-
101
- # Remove dataURIs
102
- if src.startswith("data:"):
103
- src = src.split(",")[0] + "..."
104
-
105
- return "![%s](%s%s)" % (alt, src, title_part)
106
-
107
- def convert_soup(self, soup: Any) -> str:
108
- return super().convert_soup(soup) # type: ignore
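A minimal illustration of the behaviors listed in the class docstring above (ATX headings, javascript links stripped, data URIs truncated); the snippet and the expected output are assumptions based on this class's code, not part of the original file:

from bs4 import BeautifulSoup

html_snippet = '<h1>Title</h1><a href="javascript:void(0)">click</a><img src="data:image/png;base64,AAAA" alt="logo">'
md = _CustomMarkdownify().convert_soup(BeautifulSoup(html_snippet, "html.parser"))
print(md)
# Expected shape: a "# Title" ATX heading, the javascript link reduced to the plain text "click",
# and the image source truncated to "data:image/png;base64..."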
109
-
110
-
111
- class DocumentConverterResult:
112
- """The result of converting a document to text."""
113
-
114
- def __init__(self, title: Union[str, None] = None, text_content: str = ""):
115
- self.title: Union[str, None] = title
116
- self.text_content: str = text_content
117
-
118
-
119
- class DocumentConverter:
120
- """Abstract superclass of all DocumentConverters."""
121
-
122
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
123
- raise NotImplementedError()
124
-
125
-
126
- class PlainTextConverter(DocumentConverter):
127
- """Anything with content type text/plain"""
128
-
129
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
130
- # Guess the content type from any file extension that might be around
131
- content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
132
-
133
- # Only accept text files
134
- if content_type is None:
135
- return None
136
- # elif "text/" not in content_type.lower():
137
- # return None
138
-
139
- text_content = ""
140
- with open(local_path, "rt", encoding="utf-8") as fh:
141
- text_content = fh.read()
142
- return DocumentConverterResult(
143
- title=None,
144
- text_content=text_content,
145
- )
146
-
147
-
148
- class HtmlConverter(DocumentConverter):
149
- """Anything with content type text/html"""
150
-
151
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
152
- # Bail if not html
153
- extension = kwargs.get("file_extension", "")
154
- if extension.lower() not in [".html", ".htm"]:
155
- return None
156
-
157
- result = None
158
- with open(local_path, "rt", encoding="utf-8") as fh:
159
- result = self._convert(fh.read())
160
-
161
- return result
162
-
163
- def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
164
- """Helper function that converts an HTML string."""
165
-
166
- # Parse the string
167
- soup = BeautifulSoup(html_content, "html.parser")
168
-
169
- # Remove javascript and style blocks
170
- for script in soup(["script", "style"]):
171
- script.extract()
172
-
173
- # Print only the main content
174
- body_elm = soup.find("body")
175
- webpage_text = ""
176
- if body_elm:
177
- webpage_text = _CustomMarkdownify().convert_soup(body_elm)
178
- else:
179
- webpage_text = _CustomMarkdownify().convert_soup(soup)
180
-
181
- assert isinstance(webpage_text, str)
182
-
183
- return DocumentConverterResult(
184
- title=None if soup.title is None else soup.title.string, text_content=webpage_text
185
- )
186
-
187
-
188
- class WikipediaConverter(DocumentConverter):
189
- """Handle Wikipedia pages separately, focusing only on the main document content."""
190
-
191
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
192
- # Bail if not Wikipedia
193
- extension = kwargs.get("file_extension", "")
194
- if extension.lower() not in [".html", ".htm"]:
195
- return None
196
- url = kwargs.get("url", "")
197
- if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
198
- return None
199
-
200
- # Parse the file
201
- soup = None
202
- with open(local_path, "rt", encoding="utf-8") as fh:
203
- soup = BeautifulSoup(fh.read(), "html.parser")
204
-
205
- # Remove javascript and style blocks
206
- for script in soup(["script", "style"]):
207
- script.extract()
208
-
209
- # Print only the main content
210
- body_elm = soup.find("div", {"id": "mw-content-text"})
211
- title_elm = soup.find("span", {"class": "mw-page-title-main"})
212
-
213
- webpage_text = ""
214
- main_title = None if soup.title is None else soup.title.string
215
-
216
- if body_elm:
217
- # What's the title
218
- if title_elm and len(title_elm) > 0:
219
- main_title = title_elm.string # type: ignore
220
- assert isinstance(main_title, str)
221
-
222
- # Convert the page
223
- webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
224
- else:
225
- webpage_text = _CustomMarkdownify().convert_soup(soup)
226
-
227
- return DocumentConverterResult(
228
- title=main_title,
229
- text_content=webpage_text,
230
- )
231
-
232
-
233
- class YouTubeConverter(DocumentConverter):
234
- """Handle YouTube specially, focusing on the video title, description, and transcript."""
235
-
236
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
237
- # Bail if not YouTube
238
- extension = kwargs.get("file_extension", "")
239
- if extension.lower() not in [".html", ".htm"]:
240
- return None
241
- url = kwargs.get("url", "")
242
- if not url.startswith("https://www.youtube.com/watch?"):
243
- return None
244
-
245
- # Parse the file
246
- soup = None
247
- with open(local_path, "rt", encoding="utf-8") as fh:
248
- soup = BeautifulSoup(fh.read(), "html.parser")
249
-
250
- # Read the meta tags
251
- assert soup.title is not None and soup.title.string is not None
252
- metadata: Dict[str, str] = {"title": soup.title.string}
253
- for meta in soup(["meta"]):
254
- for a in meta.attrs:
255
- if a in ["itemprop", "property", "name"]:
256
- metadata[meta[a]] = meta.get("content", "")
257
- break
258
-
259
- # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
260
- try:
261
- for script in soup(["script"]):
262
- content = script.text
263
- if "ytInitialData" in content:
264
- lines = re.split(r"\r?\n", content)
265
- obj_start = lines[0].find("{")
266
- obj_end = lines[0].rfind("}")
267
- if obj_start >= 0 and obj_end >= 0:
268
- data = json.loads(lines[0][obj_start : obj_end + 1])
269
- attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
270
- if attrdesc:
271
- metadata["description"] = str(attrdesc["content"])
272
- break
273
- except Exception:
274
- pass
275
-
276
- # Start preparing the page
277
- webpage_text = "# YouTube\n"
278
-
279
- title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
280
- assert isinstance(title, str)
281
-
282
- if title:
283
- webpage_text += f"\n## {title}\n"
284
-
285
- stats = ""
286
- views = self._get(metadata, ["interactionCount"]) # type: ignore
287
- if views:
288
- stats += f"- **Views:** {views}\n"
289
-
290
- keywords = self._get(metadata, ["keywords"]) # type: ignore
291
- if keywords:
292
- stats += f"- **Keywords:** {keywords}\n"
293
-
294
- runtime = self._get(metadata, ["duration"]) # type: ignore
295
- if runtime:
296
- stats += f"- **Runtime:** {runtime}\n"
297
-
298
- if len(stats) > 0:
299
- webpage_text += f"\n### Video Metadata\n{stats}\n"
300
-
301
- description = self._get(metadata, ["description", "og:description"]) # type: ignore
302
- if description:
303
- webpage_text += f"\n### Description\n{description}\n"
304
-
305
- transcript_text = ""
306
- parsed_url = urlparse(url) # type: ignore
307
- params = parse_qs(parsed_url.query) # type: ignore
308
- if "v" in params:
309
- assert isinstance(params["v"][0], str)
310
- video_id = str(params["v"][0])
311
- try:
312
- # Must be a single transcript.
313
- transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
314
- # transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
315
- # Alternative formatting:
316
- transcript_text = SRTFormatter().format_transcript(transcript)
317
- except Exception:
318
- pass
319
- if transcript_text:
320
- webpage_text += f"\n### Transcript\n{transcript_text}\n"
321
-
322
- title = title if title else soup.title.string
323
- assert isinstance(title, str)
324
-
325
- return DocumentConverterResult(
326
- title=title,
327
- text_content=webpage_text,
328
- )
329
-
330
- def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
331
- for k in keys:
332
- if k in metadata:
333
- return metadata[k]
334
- return default
335
-
336
- def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
337
- if isinstance(json, list):
338
- for elm in json:
339
- ret = self._findKey(elm, key)
340
- if ret is not None:
341
- return ret
342
- elif isinstance(json, dict):
343
- for k in json:
344
- if k == key:
345
- return json[k]
346
- else:
347
- ret = self._findKey(json[k], key)
348
- if ret is not None:
349
- return ret
350
- return None
351
-
352
-
353
- class PdfConverter(DocumentConverter):
354
- """
355
- Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
356
- """
357
-
358
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
359
- # Bail if not a PDF
360
- extension = kwargs.get("file_extension", "")
361
- if extension.lower() != ".pdf":
362
- return None
363
-
364
- return DocumentConverterResult(
365
- title=None,
366
- text_content=pdfminer.high_level.extract_text(local_path),
367
- )
368
-
369
-
370
- class DocxConverter(HtmlConverter):
371
- """
372
- Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
373
- """
374
-
375
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
376
- # Bail if not a DOCX
377
- extension = kwargs.get("file_extension", "")
378
- if extension.lower() != ".docx":
379
- return None
380
-
381
- result = None
382
- with open(local_path, "rb") as docx_file:
383
- result = mammoth.convert_to_html(docx_file)
384
- html_content = result.value
385
- result = self._convert(html_content)
386
-
387
- return result
388
-
389
-
390
- class XlsxConverter(HtmlConverter):
391
- """
392
- Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
393
- """
394
-
395
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
396
- # Bail if not an XLSX
397
- extension = kwargs.get("file_extension", "")
398
- if extension.lower() not in [".xlsx", ".xls"]:
399
- return None
400
-
401
- sheets = pd.read_excel(local_path, sheet_name=None)
402
- md_content = ""
403
- for s in sheets:
404
- md_content += f"## {s}\n"
405
- html_content = sheets[s].to_html(index=False)
406
- md_content += self._convert(html_content).text_content.strip() + "\n\n"
407
-
408
- return DocumentConverterResult(
409
- title=None,
410
- text_content=md_content.strip(),
411
- )
412
-
413
-
414
- class PptxConverter(HtmlConverter):
415
- """
416
- Converts PPTX files to Markdown. Supports headings, tables and images with alt text.
417
- """
418
-
419
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
420
- # Bail if not a PPTX
421
- extension = kwargs.get("file_extension", "")
422
- if extension.lower() != ".pptx":
423
- return None
424
-
425
- md_content = ""
426
-
427
- presentation = pptx.Presentation(local_path)
428
- slide_num = 0
429
- for slide in presentation.slides:
430
- slide_num += 1
431
-
432
- md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
433
-
434
- title = slide.shapes.title
435
- for shape in slide.shapes:
436
- # Pictures
437
- if self._is_picture(shape):
438
- # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
439
- alt_text = ""
440
- try:
441
- alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
442
- except Exception:
443
- pass
444
-
445
- # A placeholder name
446
- filename = re.sub(r"\W", "", shape.name) + ".jpg"
447
- md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
448
-
449
- # Tables
450
- if self._is_table(shape):
451
- html_table = "<html><body><table>"
452
- first_row = True
453
- for row in shape.table.rows:
454
- html_table += "<tr>"
455
- for cell in row.cells:
456
- if first_row:
457
- html_table += "<th>" + html.escape(cell.text) + "</th>"
458
- else:
459
- html_table += "<td>" + html.escape(cell.text) + "</td>"
460
- html_table += "</tr>"
461
- first_row = False
462
- html_table += "</table></body></html>"
463
- md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
464
-
465
- # Text areas
466
- elif shape.has_text_frame:
467
- if shape == title:
468
- md_content += "# " + shape.text.lstrip() + "\n"
469
- else:
470
- md_content += shape.text + "\n"
471
-
472
- md_content = md_content.strip()
473
-
474
- if slide.has_notes_slide:
475
- md_content += "\n\n### Notes:\n"
476
- notes_frame = slide.notes_slide.notes_text_frame
477
- if notes_frame is not None:
478
- md_content += notes_frame.text
479
- md_content = md_content.strip()
480
-
481
- return DocumentConverterResult(
482
- title=None,
483
- text_content=md_content.strip(),
484
- )
485
-
486
- def _is_picture(self, shape):
487
- if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
488
- return True
489
- if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
490
- if hasattr(shape, "image"):
491
- return True
492
- return False
493
-
494
- def _is_table(self, shape):
495
- if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
496
- return True
497
- return False
498
-
499
-
500
- class MediaConverter(DocumentConverter):
501
- """
502
- Abstract class for multi-modal media (e.g., images and audio)
503
- """
504
-
505
- def _get_metadata(self, local_path):
506
- exiftool = shutil.which("exiftool")
507
- if not exiftool:
508
- return None
509
- else:
510
- try:
511
- result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
512
- return json.loads(result)[0]
513
- except Exception:
514
- return None
515
-
516
-
517
- class WavConverter(MediaConverter):
518
- """
519
- Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
520
- """
521
-
522
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
523
- # Bail if not a WAV
524
- extension = kwargs.get("file_extension", "")
525
- if extension.lower() != ".wav":
526
- return None
527
-
528
- md_content = ""
529
-
530
- # Add metadata
531
- metadata = self._get_metadata(local_path)
532
- if metadata:
533
- for f in [
534
- "Title",
535
- "Artist",
536
- "Author",
537
- "Band",
538
- "Album",
539
- "Genre",
540
- "Track",
541
- "DateTimeOriginal",
542
- "CreateDate",
543
- "Duration",
544
- ]:
545
- if f in metadata:
546
- md_content += f"{f}: {metadata[f]}\n"
547
-
548
- # Transcribe
549
- try:
550
- transcript = self._transcribe_audio(local_path)
551
- md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
552
- except Exception:
553
- md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
554
-
555
- return DocumentConverterResult(
556
- title=None,
557
- text_content=md_content.strip(),
558
- )
559
-
560
- def _transcribe_audio(self, local_path) -> str:
561
- recognizer = sr.Recognizer()
562
- with sr.AudioFile(local_path) as source:
563
- audio = recognizer.record(source)
564
- return recognizer.recognize_google(audio).strip()
565
-
566
-
567
- class Mp3Converter(WavConverter):
568
- """
569
- Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
570
- """
571
-
572
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
573
- # Bail if not an MP3
574
- extension = kwargs.get("file_extension", "")
575
- if extension.lower() != ".mp3":
576
- return None
577
-
578
- md_content = ""
579
-
580
- # Add metadata
581
- metadata = self._get_metadata(local_path)
582
- if metadata:
583
- for f in [
584
- "Title",
585
- "Artist",
586
- "Author",
587
- "Band",
588
- "Album",
589
- "Genre",
590
- "Track",
591
- "DateTimeOriginal",
592
- "CreateDate",
593
- "Duration",
594
- ]:
595
- if f in metadata:
596
- md_content += f"{f}: {metadata[f]}\n"
597
-
598
- # Transcribe
599
- handle, temp_path = tempfile.mkstemp(suffix=".wav")
600
- os.close(handle)
601
- try:
602
- sound = pydub.AudioSegment.from_mp3(local_path)
603
- sound.export(temp_path, format="wav")
604
-
605
- _args = dict()
606
- _args.update(kwargs)
607
- _args["file_extension"] = ".wav"
608
-
609
- try:
610
- transcript = super()._transcribe_audio(temp_path).strip()
611
- md_content += "\n\n### Audio Transcript:\n" + (
612
- "[No speech detected]" if transcript == "" else transcript
613
- )
614
- except Exception:
615
- md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
616
-
617
- finally:
618
- os.unlink(temp_path)
619
-
620
- # Return the result
621
- return DocumentConverterResult(
622
- title=None,
623
- text_content=md_content.strip(),
624
- )
625
-
626
-
627
- class ImageConverter(MediaConverter):
628
- """
629
- Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
630
- """
631
-
632
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
633
- # Bail if not an image
634
- extension = kwargs.get("file_extension", "")
635
- if extension.lower() not in [".jpg", ".jpeg", ".png"]:
636
- return None
637
-
638
- md_content = ""
639
-
640
- # Add metadata
641
- metadata = self._get_metadata(local_path)
642
- if metadata:
643
- for f in [
644
- "ImageSize",
645
- "Title",
646
- "Caption",
647
- "Description",
648
- "Keywords",
649
- "Artist",
650
- "Author",
651
- "DateTimeOriginal",
652
- "CreateDate",
653
- "GPSPosition",
654
- ]:
655
- if f in metadata:
656
- md_content += f"{f}: {metadata[f]}\n"
657
-
658
- # Try describing the image with GPTV
659
- mlm_client = kwargs.get("mlm_client")
660
- mlm_model = kwargs.get("mlm_model")
661
- if mlm_client is not None and mlm_model is not None:
662
- md_content += (
663
- "\n# Description:\n"
664
- + self._get_mlm_description(
665
- local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
666
- ).strip()
667
- + "\n"
668
- )
669
-
670
- return DocumentConverterResult(
671
- title=None,
672
- text_content=md_content,
673
- )
674
-
675
- def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
676
- if prompt is None or prompt.strip() == "":
677
- prompt = "Write a detailed caption for this image."
678
-
679
- sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
680
-
681
- data_uri = ""
682
- with open(local_path, "rb") as image_file:
683
- content_type, encoding = mimetypes.guess_type("_dummy" + extension)
684
- if content_type is None:
685
- content_type = "image/jpeg"
686
- image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
687
- data_uri = f"data:{content_type};base64,{image_base64}"
688
-
689
- messages = [
690
- {
691
- "role": "user",
692
- "content": [
693
- {"type": "text", "text": prompt},
694
- {
695
- "type": "image_url",
696
- "image_url": {
697
- "url": data_uri,
698
- },
699
- },
700
- ],
701
- }
702
- ]
703
-
704
- response = client.chat.completions.create(model=model, messages=messages)
705
- return response.choices[0].message.content
706
-
707
-
708
- class FileConversionException(BaseException):
709
- pass
710
-
711
-
712
- class UnsupportedFormatException(BaseException):
713
- pass
714
-
715
-
716
- class MarkdownConverter:
717
- """(In preview) An extremely simple text-based document reader, suitable for LLM use.
718
- This reader will convert common file-types or webpages to Markdown."""
719
-
720
- def __init__(
721
- self,
722
- requests_session: Optional[requests.Session] = None,
723
- mlm_client: Optional[Any] = None,
724
- mlm_model: Optional[Any] = None,
725
- ):
726
- if requests_session is None:
727
- self._requests_session = requests.Session()
728
- else:
729
- self._requests_session = requests_session
730
-
731
- self._mlm_client = mlm_client
732
- self._mlm_model = mlm_model
733
-
734
- self._page_converters: List[DocumentConverter] = []
735
-
736
- # Register converters for successful browsing operations
737
- # Later registrations are tried first / take higher priority than earlier registrations
738
- # To this end, the most specific converters should appear below the most generic converters
739
- self.register_page_converter(PlainTextConverter())
740
- self.register_page_converter(HtmlConverter())
741
- self.register_page_converter(WikipediaConverter())
742
- self.register_page_converter(YouTubeConverter())
743
- self.register_page_converter(DocxConverter())
744
- self.register_page_converter(XlsxConverter())
745
- self.register_page_converter(PptxConverter())
746
- self.register_page_converter(WavConverter())
747
- self.register_page_converter(Mp3Converter())
748
- self.register_page_converter(ImageConverter())
749
- self.register_page_converter(PdfConverter())
750
-
751
- def convert(
752
- self, source: Union[str, requests.Response], **kwargs: Any
753
- ) -> DocumentConverterResult: # TODO: deal with kwargs
754
- """
755
- Args:
756
- - source: can be a string representing a path or url, or a requests.response object
757
- - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
758
- """
759
-
760
- # Local path or url
761
- if isinstance(source, str):
762
- if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
763
- return self.convert_url(source, **kwargs)
764
- else:
765
- return self.convert_local(source, **kwargs)
766
- # Request response
767
- elif isinstance(source, requests.Response):
768
- return self.convert_response(source, **kwargs)
769
-
770
- def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
771
- # Prepare a list of extensions to try (in order of priority)
772
- ext = kwargs.get("file_extension")
773
- extensions = [ext] if ext is not None else []
774
-
775
- # Get extension alternatives from the path and puremagic
776
- base, ext = os.path.splitext(path)
777
- self._append_ext(extensions, ext)
778
- self._append_ext(extensions, self._guess_ext_magic(path))
779
-
780
- # Convert
781
- return self._convert(path, extensions, **kwargs)
782
-
783
- # TODO what should stream's type be?
784
- def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
785
- # Prepare a list of extensions to try (in order of priority)
786
- ext = kwargs.get("file_extension")
787
- extensions = [ext] if ext is not None else []
788
-
789
- # Save the file locally to a temporary file. It will be deleted before this method exits
790
- handle, temp_path = tempfile.mkstemp()
791
- fh = os.fdopen(handle, "wb")
792
- result = None
793
- try:
794
- # Write to the temporary file
795
- content = stream.read()
796
- if isinstance(content, str):
797
- fh.write(content.encode("utf-8"))
798
- else:
799
- fh.write(content)
800
- fh.close()
801
-
802
- # Use puremagic to check for more extension options
803
- self._append_ext(extensions, self._guess_ext_magic(temp_path))
804
-
805
- # Convert
806
- result = self._convert(temp_path, extensions, **kwargs)
807
- # Clean up
808
- finally:
809
- try:
810
- fh.close()
811
- except Exception:
812
- pass
813
- os.unlink(temp_path)
814
-
815
- return result
816
-
817
- def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type
818
- # Send a HTTP request to the URL
819
- user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
820
- response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
821
- response.raise_for_status()
822
- return self.convert_response(response, **kwargs)
823
-
824
- def convert_response(
825
- self, response: requests.Response, **kwargs: Any
826
- ) -> DocumentConverterResult: # TODO fix kwargs type
827
- # Prepare a list of extensions to try (in order of priority)
828
- ext = kwargs.get("file_extension")
829
- extensions = [ext] if ext is not None else []
830
-
831
- # Guess from the mimetype
832
- content_type = response.headers.get("content-type", "").split(";")[0]
833
- self._append_ext(extensions, mimetypes.guess_extension(content_type))
834
-
835
- # Read the content disposition if there is one
836
- content_disposition = response.headers.get("content-disposition", "")
837
- m = re.search(r"filename=([^;]+)", content_disposition)
838
- if m:
839
- base, ext = os.path.splitext(m.group(1).strip("\"'"))
840
- self._append_ext(extensions, ext)
841
-
842
- # Read from the extension from the path
843
- base, ext = os.path.splitext(urlparse(response.url).path)
844
- self._append_ext(extensions, ext)
845
-
846
- # Save the file locally to a temporary file. It will be deleted before this method exits
847
- handle, temp_path = tempfile.mkstemp()
848
- fh = os.fdopen(handle, "wb")
849
- result = None
850
- try:
851
- # Download the file
852
- for chunk in response.iter_content(chunk_size=512):
853
- fh.write(chunk)
854
- fh.close()
855
-
856
- # Use puremagic to check for more extension options
857
- self._append_ext(extensions, self._guess_ext_magic(temp_path))
858
-
859
- # Convert
860
- result = self._convert(temp_path, extensions, url=response.url)
861
- except Exception as e:
862
- print(f"Error in converting: {e}")
863
-
864
- # Clean up
865
- finally:
866
- try:
867
- fh.close()
868
- except Exception:
869
- pass
870
- os.unlink(temp_path)
871
-
872
- return result
873
-
874
- def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
875
- error_trace = ""
876
- for ext in extensions + [None]: # Try last with no extension
877
- for converter in self._page_converters:
878
- _kwargs = copy.deepcopy(kwargs)
879
-
880
- # Overwrite file_extension appropriately
881
- if ext is None:
882
- if "file_extension" in _kwargs:
883
- del _kwargs["file_extension"]
884
- else:
885
- _kwargs.update({"file_extension": ext})
886
-
887
- # Copy any additional global options
888
- if "mlm_client" not in _kwargs and self._mlm_client is not None:
889
- _kwargs["mlm_client"] = self._mlm_client
890
-
891
- if "mlm_model" not in _kwargs and self._mlm_model is not None:
892
- _kwargs["mlm_model"] = self._mlm_model
893
-
894
- # If we hit an error, log it and keep trying
895
- res = None
- try:
896
- res = converter.convert(local_path, **_kwargs)
897
- except Exception:
898
- error_trace = ("\n\n" + traceback.format_exc()).strip()
899
-
900
- if res is not None:
901
- # Normalize the content
902
- res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
903
- res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
904
-
905
- # Todo
906
- return res
907
-
908
- # If we got this far without success, report any exceptions
909
- if len(error_trace) > 0:
910
- raise FileConversionException(
911
- f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
912
- )
913
-
914
- # Nothing can handle it!
915
- raise UnsupportedFormatException(
916
- f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
917
- )
918
-
919
- def _append_ext(self, extensions, ext):
920
- """Append a unique non-None, non-empty extension to a list of extensions."""
921
- if ext is None:
922
- return
923
- ext = ext.strip()
924
- if ext == "":
925
- return
926
- # Append only if not already present (keep the extension list unique)
927
- if ext not in extensions:
928
- extensions.append(ext)
929
-
930
- def _guess_ext_magic(self, path):
931
- """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
932
- # Use puremagic to guess
933
- try:
934
- guesses = puremagic.magic_file(path)
935
- if len(guesses) > 0:
936
- ext = guesses[0].extension.strip()
937
- if len(ext) > 0:
938
- return ext
939
- except FileNotFoundError:
940
- pass
941
- except IsADirectoryError:
942
- pass
943
- except PermissionError:
944
- pass
945
- return None
946
-
947
- def register_page_converter(self, converter: DocumentConverter) -> None:
948
- """Register a page text converter."""
949
- self._page_converters.insert(0, converter)
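
For context on how the class deleted above was driven elsewhere in these scripts, here is a minimal usage sketch. The import path mirrors the deleted scripts/ layout, and the URL, local path and extension hint are placeholders only:

    from scripts.mdconvert import MarkdownConverter, UnsupportedFormatException

    converter = MarkdownConverter()

    # convert() dispatches on the source: http(s)/file URLs go through
    # convert_url(), anything else is treated as a local path.
    result = converter.convert("https://example.com/report.pdf")
    print(result.title)
    print(result.text_content[:500])

    # When a path has no extension, a file_extension hint steers _convert().
    try:
        result = converter.convert("/tmp/downloaded_blob", file_extension=".xlsx")
    except UnsupportedFormatException as exc:
        print(f"No registered converter accepted the file: {exc}")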
 
scripts/reformulator.py DELETED
@@ -1,86 +0,0 @@
1
- # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
- # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
- import copy
4
-
5
- from smolagents.models import MessageRole, Model
6
-
7
-
8
- def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
9
- messages = [
10
- {
11
- "role": MessageRole.SYSTEM,
12
- "content": [
13
- {
14
- "type": "text",
15
- "text": f"""Earlier you were asked the following:
16
-
17
- {original_task}
18
-
19
- Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
20
- }
21
- ],
22
- }
23
- ]
24
-
25
- # The first message just repeats the question, so remove it
26
- # if len(inner_messages) > 1:
27
- # del inner_messages[0]
28
-
29
- # copy them to this context
30
- try:
31
- for message in inner_messages:
32
- if not message.get("content"):
33
- continue
34
- message = copy.deepcopy(message)
35
- message["role"] = MessageRole.USER
36
- messages.append(message)
37
- except Exception:
38
- messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}]
39
-
40
- # ask for the final answer
41
- messages.append(
42
- {
43
- "role": MessageRole.USER,
44
- "content": [
45
- {
46
- "type": "text",
47
- "text": f"""
48
- Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
49
-
50
- {original_task}
51
-
52
- To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
53
- Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
54
- ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
55
- If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
56
- If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
57
- If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
58
- If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
59
- """,
60
- }
61
- ],
62
- }
63
- )
64
-
65
- response = reformulation_model(messages).content
66
-
67
- final_answer = response.split("FINAL ANSWER: ")[-1].strip()
68
- print("> Reformulated answer: ", final_answer)
69
-
70
- # if "unable to determine" in final_answer.lower():
71
- # messages.append({"role": MessageRole.ASSISTANT, "content": response })
72
- # messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
73
- # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.
74
-
75
- # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
76
- # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
77
- # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
78
- # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
79
- # If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
80
- # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
81
- # """.strip()}]})
82
-
83
- # response = model(messages).content
84
- # print("\n>>>Making an educated guess.\n", response)
85
- # final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
86
- return final_answer
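
A small sketch of how prepare_response above can be exercised end to end. The EchoModel stand-in only mimics the callable-returning-.content interface of a smolagents Model; in the real pipeline a proper Model instance and the agent's message transcript are passed instead:

    from smolagents.models import MessageRole
    from scripts.reformulator import prepare_response

    class EchoModel:
        # Stand-in for a smolagents Model: callable on messages, returns an
        # object with a .content string.
        def __call__(self, messages):
            class _Response:
                content = "FINAL ANSWER: 1889"
            return _Response()

    inner_messages = [
        {
            "role": MessageRole.ASSISTANT,
            "content": [{"type": "text", "text": "The Eiffel Tower was completed in 1889."}],
        }
    ]

    answer = prepare_response(
        original_task="In what year was the Eiffel Tower completed?",
        inner_messages=inner_messages,
        reformulation_model=EchoModel(),
    )
    print(answer)  # -> 1889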
 
scripts/run_agents.py DELETED
@@ -1,87 +0,0 @@
1
- import json
2
- import os
3
- import shutil
4
- import textwrap
5
- from pathlib import Path
6
-
7
- # import tqdm.asyncio
8
- from smolagents.utils import AgentError
9
-
10
-
11
- def serialize_agent_error(obj):
12
- if isinstance(obj, AgentError):
13
- return {"error_type": obj.__class__.__name__, "message": obj.message}
14
- else:
15
- return str(obj)
16
-
17
-
18
- def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
19
- prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
20
- {question}. But do not try to answer the question directly!
21
- Do not add any information that is not present in the image."""
22
- return visual_inspection_tool(image_path=file_name, question=prompt)
23
-
24
-
25
- def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
26
- prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
27
- {question}. But do not try to answer the question directly!
28
- Do not add any information that is not present in the document."""
29
- return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)
30
-
31
-
32
- def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
33
- file_extension = file_path.split(".")[-1]
34
- if file_extension in ["png", "jpg", "jpeg"]:
35
- file_description = f" - Attached image: {file_path}"
36
- file_description += (
37
- f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
38
- )
39
- return file_description
40
- elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
41
- file_description = f" - Attached document: {file_path}"
42
- image_path = file_path.split(".")[0] + ".png"
43
- if os.path.exists(image_path):
44
- description = get_image_description(image_path, question, visual_inspection_tool)
45
- else:
46
- description = get_document_description(file_path, question, document_inspection_tool)
47
- file_description += f"\n -> File description: {description}"
48
- return file_description
49
- elif file_extension in ["mp3", "m4a", "wav"]:
50
- return f" - Attached audio: {file_path}"
51
- else:
52
- return f" - Attached file: {file_path}"
53
-
54
-
55
- def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
56
- folder_path = file_path.replace(".zip", "")
57
- os.makedirs(folder_path, exist_ok=True)
58
- shutil.unpack_archive(file_path, folder_path)
59
-
60
- prompt_use_files = ""
61
- for root, dirs, files in os.walk(folder_path):
62
- for file in files:
63
- file_path = os.path.join(root, file)
64
- prompt_use_files += "\n" + textwrap.indent(
65
- get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool),
66
- prefix=" ",
67
- )
68
- return prompt_use_files
69
-
70
-
71
- def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
72
- f = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
73
- done = set()
74
- if f.exists():
75
- with open(f, encoding="utf-8") as fh:
76
- done = {json.loads(line)["task_id"] for line in fh if line.strip()}
77
-
78
- tasks = []
79
- for i in range(total):
80
- task_id = int(data[i]["task_id"])
81
- if task_id not in done:
82
- if tasks_ids is not None:
83
- if task_id in tasks_ids:
84
- tasks.append(data[i])
85
- else:
86
- tasks.append(data[i])
87
- return tasks
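
A quick sketch of the task-filtering helper above; the data records, output path and task ids are illustrative. Already-answered task ids are read from <base>_answers.jsonl next to the output file, and tasks_ids further restricts the run:

    from pathlib import Path
    from scripts.run_agents import get_tasks_to_run

    data = [
        {"task_id": "1", "question": "First question"},
        {"task_id": "2", "question": "Second question"},
        {"task_id": "3", "question": "Third question"},
    ]

    todo = get_tasks_to_run(
        data,
        total=len(data),
        base_filename=Path("output/validation/gaia"),
        tasks_ids=[1, 3],
    )
    print([t["task_id"] for t in todo])  # -> ['1', '3'] if nothing was answered yet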
 
scripts/text_inspector_tool.py DELETED
@@ -1,122 +0,0 @@
1
- from typing import Optional
2
-
3
- from smolagents import Tool
4
- from smolagents.models import MessageRole, Model
5
-
6
- from .mdconvert import MarkdownConverter
7
-
8
-
9
- class TextInspectorTool(Tool):
10
- name = "inspect_file_as_text"
11
- description = """
12
- You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
13
- This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""
14
-
15
- inputs = {
16
- "file_path": {
17
- "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
18
- "type": "string",
19
- },
20
- "question": {
21
- "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
22
- "type": "string",
23
- "nullable": True,
24
- },
25
- }
26
- output_type = "string"
27
- md_converter = MarkdownConverter()
28
-
29
- def __init__(self, model: Model, text_limit: int):
30
- super().__init__()
31
- self.model = model
32
- self.text_limit = text_limit
33
-
34
- def forward_initial_exam_mode(self, file_path, question):
35
- result = self.md_converter.convert(file_path)
36
-
37
- if file_path[-4:] in [".png", ".jpg"]:
38
- raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
39
-
40
- if ".zip" in file_path:
41
- return result.text_content
42
-
43
- if not question:
44
- return result.text_content
45
-
46
- if len(result.text_content) < 4000:
47
- return "Document content: " + result.text_content
48
-
49
- messages = [
50
- {
51
- "role": MessageRole.SYSTEM,
52
- "content": [
53
- {
54
- "type": "text",
55
- "text": "Here is a file:\n### "
56
- + str(result.title)
57
- + "\n\n"
58
- + result.text_content[: self.text_limit],
59
- }
60
- ],
61
- },
62
- {
63
- "role": MessageRole.USER,
64
- "content": [
65
- {
66
- "type": "text",
67
- "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
68
- + question
69
- + "\n\nDon't answer the question yourself! Just provide useful notes on the document",
70
- }
71
- ],
72
- },
73
- ]
74
- return self.model(messages).content
75
-
76
- def forward(self, file_path, question: Optional[str] = None) -> str:
77
- result = self.md_converter.convert(file_path)
78
-
79
- if file_path[-4:] in [".png", ".jpg"]:
80
- raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
81
-
82
- if ".zip" in file_path:
83
- return result.text_content
84
-
85
- if not question:
86
- return result.text_content
87
-
88
- messages = [
89
- {
90
- "role": MessageRole.SYSTEM,
91
- "content": [
92
- {
93
- "type": "text",
94
- "text": "You will have to write a short caption for this file, then answer this question:"
95
- + question,
96
- }
97
- ],
98
- },
99
- {
100
- "role": MessageRole.USER,
101
- "content": [
102
- {
103
- "type": "text",
104
- "text": "Here is the complete file:\n### "
105
- + str(result.title)
106
- + "\n\n"
107
- + result.text_content[: self.text_limit],
108
- }
109
- ],
110
- },
111
- {
112
- "role": MessageRole.USER,
113
- "content": [
114
- {
115
- "type": "text",
116
- "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
117
- + question,
118
- }
119
- ],
120
- },
121
- ]
122
- return self.model(messages).content
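
A sketch of instantiating the tool above. With question=None the tool simply returns the converted markdown and never calls the model, so None is passed here as a placeholder; in the real setup a smolagents Model instance goes in its place, and the document path is illustrative:

    from scripts.text_inspector_tool import TextInspectorTool

    inspector = TextInspectorTool(model=None, text_limit=100_000)

    # No question -> raw markdown of the document (.pdf, .docx, .xlsx, ...).
    print(inspector.forward("report.docx"))

    # With a question (and a real model), the tool captions the document and
    # answers using at most text_limit characters of its content:
    # inspector.forward("report.docx", question="What is the total budget?")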
 
scripts/text_web_browser.py DELETED
@@ -1,564 +0,0 @@
1
- # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
- # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
- import mimetypes
4
- import os
5
- import pathlib
6
- import re
7
- import time
8
- import uuid
9
- from typing import Any, Dict, List, Optional, Tuple, Union
10
- from urllib.parse import unquote, urljoin, urlparse
11
-
12
- import pathvalidate
13
- import requests
14
- from serpapi import GoogleSearch
15
- # from serpapi.google_search import GoogleSearch
16
-
17
- from smolagents import Tool
18
-
19
- from .cookies import COOKIES
20
- from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
21
-
22
-
23
- class SimpleTextBrowser:
24
- """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
25
-
26
- def __init__(
27
- self,
28
- start_page: Optional[str] = None,
29
- viewport_size: Optional[int] = 1024 * 8,
30
- downloads_folder: Optional[str] = None,
31
- serpapi_key: Optional[str] = None,
32
- request_kwargs: Optional[Dict[str, Any]] = None,
33
- ):
34
- self.start_page: str = start_page if start_page else "about:blank"
35
- self.viewport_size = viewport_size # Applies only to the standard uri types
36
- self.downloads_folder = downloads_folder
37
- self.history: List[Tuple[str, float]] = list()
38
- self.page_title: Optional[str] = None
39
- self.viewport_current_page = 0
40
- self.viewport_pages: List[Tuple[int, int]] = list()
41
- self.set_address(self.start_page)
42
- self.serpapi_key = serpapi_key
43
- self.request_kwargs = request_kwargs if request_kwargs is not None else {}
44
- self.request_kwargs["cookies"] = COOKIES
45
- self._mdconvert = MarkdownConverter()
46
- self._page_content: str = ""
47
-
48
- self._find_on_page_query: Union[str, None] = None
49
- self._find_on_page_last_result: Union[int, None] = None # Location of the last result
50
-
51
- @property
52
- def address(self) -> str:
53
- """Return the address of the current page."""
54
- return self.history[-1][0]
55
-
56
- def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
57
- # TODO: Handle anchors
58
- self.history.append((uri_or_path, time.time()))
59
-
60
- # Handle special URIs
61
- if uri_or_path == "about:blank":
62
- self._set_page_content("")
63
- elif uri_or_path.startswith("google:"):
64
- self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
65
- else:
66
- if (
67
- not uri_or_path.startswith("http:")
68
- and not uri_or_path.startswith("https:")
69
- and not uri_or_path.startswith("file:")
70
- ):
71
- if len(self.history) > 1:
72
- prior_address = self.history[-2][0]
73
- uri_or_path = urljoin(prior_address, uri_or_path)
74
- # Update the address with the fully-qualified path
75
- self.history[-1] = (uri_or_path, self.history[-1][1])
76
- self._fetch_page(uri_or_path)
77
-
78
- self.viewport_current_page = 0
79
- self._find_on_page_query = None
80
- self._find_on_page_last_result = None
81
-
82
- @property
83
- def viewport(self) -> str:
84
- """Return the content of the current viewport."""
85
- bounds = self.viewport_pages[self.viewport_current_page]
86
- return self.page_content[bounds[0] : bounds[1]]
87
-
88
- @property
89
- def page_content(self) -> str:
90
- """Return the full contents of the current page."""
91
- return self._page_content
92
-
93
- def _set_page_content(self, content: str) -> None:
94
- """Sets the text content of the current page."""
95
- self._page_content = content
96
- self._split_pages()
97
- if self.viewport_current_page >= len(self.viewport_pages):
98
- self.viewport_current_page = len(self.viewport_pages) - 1
99
-
100
- def page_down(self) -> None:
101
- self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
102
-
103
- def page_up(self) -> None:
104
- self.viewport_current_page = max(self.viewport_current_page - 1, 0)
105
-
106
- def find_on_page(self, query: str) -> Union[str, None]:
107
- """Searches for the query from the current viewport forward, looping back to the start if necessary."""
108
-
109
- # Did we get here via a previous find_on_page search with the same query?
110
- # If so, map to find_next
111
- if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
112
- return self.find_next()
113
-
114
- # OK, it's a new search; start from the current viewport
115
- self._find_on_page_query = query
116
- viewport_match = self._find_next_viewport(query, self.viewport_current_page)
117
- if viewport_match is None:
118
- self._find_on_page_last_result = None
119
- return None
120
- else:
121
- self.viewport_current_page = viewport_match
122
- self._find_on_page_last_result = viewport_match
123
- return self.viewport
124
-
125
- def find_next(self) -> Union[str, None]:
126
- """Scroll to the next viewport that matches the query"""
127
-
128
- if self._find_on_page_query is None:
129
- return None
130
-
131
- starting_viewport = self._find_on_page_last_result
132
- if starting_viewport is None:
133
- starting_viewport = 0
134
- else:
135
- starting_viewport += 1
136
- if starting_viewport >= len(self.viewport_pages):
137
- starting_viewport = 0
138
-
139
- viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
140
- if viewport_match is None:
141
- self._find_on_page_last_result = None
142
- return None
143
- else:
144
- self.viewport_current_page = viewport_match
145
- self._find_on_page_last_result = viewport_match
146
- return self.viewport
147
-
148
- def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
149
- """Search for matches between the starting viewport looping when reaching the end."""
150
-
151
- if query is None:
152
- return None
153
-
154
- # Normalize the query, and convert to a regular expression
155
- nquery = re.sub(r"\*", "__STAR__", query)
156
- nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
157
- nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word
158
- nquery = nquery.replace("__STAR__", ".*").lower()
159
-
160
- if nquery.strip() == "":
161
- return None
162
-
163
- idxs = list()
164
- idxs.extend(range(starting_viewport, len(self.viewport_pages)))
165
- idxs.extend(range(0, starting_viewport))
166
-
167
- for i in idxs:
168
- bounds = self.viewport_pages[i]
169
- content = self.page_content[bounds[0] : bounds[1]]
170
-
171
- # TODO: Remove markdown links and images
172
- ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
173
- if re.search(nquery, ncontent):
174
- return i
175
-
176
- return None
177
-
178
- def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
179
- """Update the address, visit the page, and return the content of the viewport."""
180
- self.set_address(path_or_uri, filter_year=filter_year)
181
- return self.viewport
182
-
183
- def _split_pages(self) -> None:
184
- # Do not split search results
185
- if self.address.startswith("google:"):
186
- self.viewport_pages = [(0, len(self._page_content))]
187
- return
188
-
189
- # Handle empty pages
190
- if len(self._page_content) == 0:
191
- self.viewport_pages = [(0, 0)]
192
- return
193
-
194
- # Break the viewport into pages
195
- self.viewport_pages = []
196
- start_idx = 0
197
- while start_idx < len(self._page_content):
198
- end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
199
- # Adjust to end on a space
200
- while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
201
- end_idx += 1
202
- self.viewport_pages.append((start_idx, end_idx))
203
- start_idx = end_idx
204
-
205
- def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
206
- if self.serpapi_key is None:
207
- raise ValueError("Missing SerpAPI key.")
208
-
209
- params = {
210
- "engine": "google",
211
- "q": query,
212
- "api_key": self.serpapi_key,
213
- }
214
- if filter_year is not None:
215
- params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
216
-
217
- search = GoogleSearch(params)
218
- results = search.get_dict()
219
- self.page_title = f"{query} - Search"
220
- if "organic_results" not in results.keys():
221
- raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
222
- if len(results["organic_results"]) == 0:
223
- year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
224
- self._set_page_content(
225
- f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
226
- )
227
- return
228
-
229
- def _prev_visit(url):
230
- for i in range(len(self.history) - 1, -1, -1):
231
- if self.history[i][0] == url:
232
- return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
233
- return ""
234
-
235
- web_snippets: List[str] = list()
236
- idx = 0
237
- if "organic_results" in results:
238
- for page in results["organic_results"]:
239
- idx += 1
240
- date_published = ""
241
- if "date" in page:
242
- date_published = "\nDate published: " + page["date"]
243
-
244
- source = ""
245
- if "source" in page:
246
- source = "\nSource: " + page["source"]
247
-
248
- snippet = ""
249
- if "snippet" in page:
250
- snippet = "\n" + page["snippet"]
251
-
252
- redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
253
-
254
- redacted_version = redacted_version.replace("Your browser can't play this video.", "")
255
- web_snippets.append(redacted_version)
256
-
257
- content = (
258
- f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
259
- + "\n\n".join(web_snippets)
260
- )
261
-
262
- self._set_page_content(content)
263
-
264
- def _fetch_page(self, url: str) -> None:
265
- download_path = ""
266
- try:
267
- if url.startswith("file://"):
268
- download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
269
- res = self._mdconvert.convert_local(download_path)
270
- self.page_title = res.title
271
- self._set_page_content(res.text_content)
272
- else:
273
- # Prepare the request parameters
274
- request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
275
- request_kwargs["stream"] = True
276
-
277
- # Send a HTTP request to the URL
278
- response = requests.get(url, **request_kwargs)
279
- response.raise_for_status()
280
-
281
- # If the HTTP request was successful
282
- content_type = response.headers.get("content-type", "")
283
-
284
- # Text or HTML
285
- if "text/" in content_type.lower():
286
- res = self._mdconvert.convert_response(response)
287
- self.page_title = res.title
288
- self._set_page_content(res.text_content)
289
- # A download
290
- else:
291
- # Try producing a safe filename
292
- fname = None
293
- download_path = None
294
- try:
295
- fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
296
- download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
297
-
298
- suffix = 0
299
- while os.path.exists(download_path) and suffix < 1000:
300
- suffix += 1
301
- base, ext = os.path.splitext(fname)
302
- new_fname = f"{base}__{suffix}{ext}"
303
- download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
304
-
305
- except NameError:
306
- pass
307
-
308
- # No suitable name, so make one
309
- if fname is None:
310
- extension = mimetypes.guess_extension(content_type)
311
- if extension is None:
312
- extension = ".download"
313
- fname = str(uuid.uuid4()) + extension
314
- download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
315
-
316
- # Open a file for writing
317
- with open(download_path, "wb") as fh:
318
- for chunk in response.iter_content(chunk_size=512):
319
- fh.write(chunk)
320
-
321
- # Render it
322
- local_uri = pathlib.Path(download_path).as_uri()
323
- self.set_address(local_uri)
324
-
325
- except UnsupportedFormatException as e:
326
- print(e)
327
- self.page_title = "Download complete."
328
- self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
329
- except FileConversionException as e:
330
- print(e)
331
- self.page_title = "Download complete."
332
- self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
333
- except FileNotFoundError:
334
- self.page_title = "Error 404"
335
- self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
336
- except requests.exceptions.RequestException as request_exception:
337
- try:
338
- self.page_title = f"Error {response.status_code}"
339
-
340
- # If the error was rendered in HTML we might as well render it
341
- content_type = response.headers.get("content-type", "")
342
- if content_type is not None and "text/html" in content_type.lower():
343
- res = self._mdconvert.convert(response)
344
- self.page_title = f"Error {response.status_code}"
345
- self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
346
- else:
347
- text = ""
348
- for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
349
- text += chunk
350
- self.page_title = f"Error {response.status_code}"
351
- self._set_page_content(f"## Error {response.status_code}\n\n{text}")
352
- except NameError:
353
- self.page_title = "Error"
354
- self._set_page_content(f"## Error\n\n{str(request_exception)}")
355
-
356
- def _state(self) -> Tuple[str, str]:
357
- header = f"Address: {self.address}\n"
358
- if self.page_title is not None:
359
- header += f"Title: {self.page_title}\n"
360
-
361
- current_page = self.viewport_current_page
362
- total_pages = len(self.viewport_pages)
363
-
364
- address = self.address
365
- for i in range(len(self.history) - 2, -1, -1): # Start from the second last
366
- if self.history[i][0] == address:
367
- header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
368
- break
369
-
370
- header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
371
- return (header, self.viewport)
372
-
373
-
374
- class SearchInformationTool(Tool):
375
- name = "web_search"
376
- description = "Perform a web search query (think a google search) and returns the search results."
377
- inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
378
- inputs["filter_year"] = {
379
- "type": "string",
380
- "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
381
- "nullable": True,
382
- }
383
- output_type = "string"
384
-
385
- def __init__(self, browser):
386
- super().__init__()
387
- self.browser = browser
388
-
389
- def forward(self, query: str, filter_year: Optional[int] = None) -> str:
390
- self.browser.visit_page(f"google: {query}", filter_year=filter_year)
391
- header, content = self.browser._state()
392
- return header.strip() + "\n=======================\n" + content
393
-
394
-
395
- class VisitTool(Tool):
396
- name = "visit_page"
397
- description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
398
- inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
399
- output_type = "string"
400
-
401
- def __init__(self, browser):
402
- super().__init__()
403
- self.browser = browser
404
-
405
- def forward(self, url: str) -> str:
406
- self.browser.visit_page(url)
407
- header, content = self.browser._state()
408
- return header.strip() + "\n=======================\n" + content
409
-
410
-
411
- class DownloadTool(Tool):
412
- name = "download_file"
413
- description = """
414
- Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
415
- After using this tool, for further inspection of the downloaded file you should return the download path to your manager via final_answer, and they will be able to inspect it.
416
- DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
417
- inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
418
- output_type = "string"
419
-
420
- def __init__(self, browser):
421
- super().__init__()
422
- self.browser = browser
423
-
424
- def forward(self, url: str) -> str:
425
- if "arxiv" in url:
426
- url = url.replace("abs", "pdf")
427
- response = requests.get(url)
428
- content_type = response.headers.get("content-type", "")
429
- extension = mimetypes.guess_extension(content_type)
430
- if extension and isinstance(extension, str):
431
- new_path = f"./downloads/file{extension}"
432
- else:
433
- new_path = "./downloads/file.object"
434
-
435
- with open(new_path, "wb") as f:
436
- f.write(response.content)
437
-
438
- if extension and ("pdf" in extension or "txt" in extension or "htm" in extension):
439
- raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
440
-
441
- return f"File was downloaded and saved under path {new_path}."
442
-
443
-
444
- class ArchiveSearchTool(Tool):
445
- name = "find_archived_url"
446
- description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
447
- inputs = {
448
- "url": {"type": "string", "description": "The url you need the archive for."},
449
- "date": {
450
- "type": "string",
451
- "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
452
- },
453
- }
454
- output_type = "string"
455
-
456
- def __init__(self, browser):
457
- super().__init__()
458
- self.browser = browser
459
-
460
- def forward(self, url, date) -> str:
461
- no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
462
- archive_url = no_timestamp_url + f"&timestamp={date}"
463
- response = requests.get(archive_url).json()
464
- response_notimestamp = requests.get(no_timestamp_url).json()
465
- if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
466
- closest = response["archived_snapshots"]["closest"]
467
- print("Archive found!", closest)
468
-
469
- elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
470
- closest = response_notimestamp["archived_snapshots"]["closest"]
471
- print("Archive found!", closest)
472
- else:
473
- raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
474
- target_url = closest["url"]
475
- self.browser.visit_page(target_url)
476
- header, content = self.browser._state()
477
- return (
478
- f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
479
- + header.strip()
480
- + "\n=======================\n"
481
- + content
482
- )
483
-
484
-
485
- class PageUpTool(Tool):
486
- name = "page_up"
487
- description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
488
- inputs = {}
489
- output_type = "string"
490
-
491
- def __init__(self, browser):
492
- super().__init__()
493
- self.browser = browser
494
-
495
- def forward(self) -> str:
496
- self.browser.page_up()
497
- header, content = self.browser._state()
498
- return header.strip() + "\n=======================\n" + content
499
-
500
-
501
- class PageDownTool(Tool):
502
- name = "page_down"
503
- description = (
504
- "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
505
- )
506
- inputs = {}
507
- output_type = "string"
508
-
509
- def __init__(self, browser):
510
- super().__init__()
511
- self.browser = browser
512
-
513
- def forward(self) -> str:
514
- self.browser.page_down()
515
- header, content = self.browser._state()
516
- return header.strip() + "\n=======================\n" + content
517
-
518
-
519
- class FinderTool(Tool):
520
- name = "find_on_page_ctrl_f"
521
- description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
522
- inputs = {
523
- "search_string": {
524
- "type": "string",
525
- "description": "The string to search for on the page. This search string supports wildcards like '*'",
526
- }
527
- }
528
- output_type = "string"
529
-
530
- def __init__(self, browser):
531
- super().__init__()
532
- self.browser = browser
533
-
534
- def forward(self, search_string: str) -> str:
535
- find_result = self.browser.find_on_page(search_string)
536
- header, content = self.browser._state()
537
-
538
- if find_result is None:
539
- return (
540
- header.strip()
541
- + f"\n=======================\nThe search string '{search_string}' was not found on this page."
542
- )
543
- else:
544
- return header.strip() + "\n=======================\n" + content
545
-
546
-
547
- class FindNextTool(Tool):
548
- name = "find_next"
549
- description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
550
- inputs = {}
551
- output_type = "string"
552
-
553
- def __init__(self, browser):
554
- super().__init__()
555
- self.browser = browser
556
-
557
- def forward(self) -> str:
558
- find_result = self.browser.find_next()
559
- header, content = self.browser._state()
560
-
561
- if find_result is None:
562
- return header.strip() + "\n=======================\nThe search string was not found on this page."
563
- else:
564
- return header.strip() + "\n=======================\n" + content
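
The browser and tools above were meant to be wired together roughly as follows; the user agent, downloads folder and SERPAPI_API_KEY environment variable name are assumptions, and the visited URL is a placeholder:

    import os
    from scripts.text_web_browser import (
        FinderTool,
        PageDownTool,
        SearchInformationTool,
        SimpleTextBrowser,
        VisitTool,
    )

    browser = SimpleTextBrowser(
        viewport_size=1024 * 8,
        downloads_folder="downloads",
        serpapi_key=os.getenv("SERPAPI_API_KEY"),
        request_kwargs={"headers": {"User-Agent": "Mozilla/5.0"}},
    )

    web_tools = [
        SearchInformationTool(browser),
        VisitTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
    ]

    # Every tool returns the state header (address, title, viewport position)
    # followed by the current viewport text.
    print(web_tools[1].forward("https://en.wikipedia.org/wiki/Eiffel_Tower"))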
 
scripts/visual_qa.py DELETED
@@ -1,187 +0,0 @@
1
- import base64
2
- import json
3
- import mimetypes
4
- import os
5
- import uuid
6
- from io import BytesIO
7
- from typing import Optional
8
-
9
- import requests
10
- from dotenv import load_dotenv
11
- from huggingface_hub import InferenceClient
12
- from PIL import Image
13
- from transformers import AutoProcessor
14
-
15
- from smolagents import Tool, tool
16
-
17
-
18
- load_dotenv(override=True)
19
-
20
- idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
21
-
22
-
23
- def process_images_and_text(image_path, query, client):
24
- messages = [
25
- {
26
- "role": "user",
27
- "content": [
28
- {"type": "image"},
29
- {"type": "text", "text": query},
30
- ],
31
- },
32
- ]
33
-
34
- prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)
35
-
36
- # load images from local directory
37
-
38
- # encode images to strings which can be sent to the endpoint
39
- def encode_local_image(image_path):
40
- # load image
41
- image = Image.open(image_path).convert("RGB")
42
-
43
- # Convert the image to a base64 string
44
- buffer = BytesIO()
45
- image.save(buffer, format="JPEG") # Use the appropriate format (e.g., JPEG, PNG)
46
- base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
47
-
48
- # add string formatting required by the endpoint
49
- image_string = f"data:image/jpeg;base64,{base64_image}"
50
-
51
- return image_string
52
-
53
- image_string = encode_local_image(image_path)
54
- prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)
55
-
56
- payload = {
57
- "inputs": prompt_with_images,
58
- "parameters": {
59
- "return_full_text": False,
60
- "max_new_tokens": 200,
61
- },
62
- }
63
-
64
- return json.loads(client.post(json=payload).decode())[0]
65
-
66
-
67
- # Function to encode the image
68
- def encode_image(image_path):
69
- if image_path.startswith("http"):
70
- user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
71
- request_kwargs = {
72
- "headers": {"User-Agent": user_agent},
73
- "stream": True,
74
- }
75
-
76
- # Send a HTTP request to the URL
77
- response = requests.get(image_path, **request_kwargs)
78
- response.raise_for_status()
79
- content_type = response.headers.get("content-type", "")
80
-
81
- extension = mimetypes.guess_extension(content_type)
82
- if extension is None:
83
- extension = ".download"
84
-
85
- fname = str(uuid.uuid4()) + extension
86
- download_path = os.path.abspath(os.path.join("downloads", fname))
87
-
88
- with open(download_path, "wb") as fh:
89
- for chunk in response.iter_content(chunk_size=512):
90
- fh.write(chunk)
91
-
92
- image_path = download_path
93
-
94
- with open(image_path, "rb") as image_file:
95
- return base64.b64encode(image_file.read()).decode("utf-8")
96
-
97
-
98
- headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
99
-
100
-
101
- def resize_image(image_path):
102
- img = Image.open(image_path)
103
- width, height = img.size
104
- img = img.resize((int(width / 2), int(height / 2)))
105
- new_image_path = f"resized_{image_path}"
106
- img.save(new_image_path)
107
- return new_image_path
108
-
109
-
110
- class VisualQATool(Tool):
111
- name = "visualizer"
112
- description = "A tool that can answer questions about attached images."
113
- inputs = {
114
- "image_path": {
115
- "description": "The path to the image on which to answer the question",
116
- "type": "string",
117
- },
118
- "question": {"description": "the question to answer", "type": "string", "nullable": True},
119
- }
120
- output_type = "string"
121
-
122
- client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
123
-
124
- def forward(self, image_path: str, question: Optional[str] = None) -> str:
125
- output = ""
126
- add_note = False
127
- if not question:
128
- add_note = True
129
- question = "Please write a detailed caption for this image."
130
- try:
131
- output = process_images_and_text(image_path, question, self.client)
132
- except Exception as e:
133
- print(e)
134
- if "Payload Too Large" in str(e):
135
- new_image_path = resize_image(image_path)
136
- output = process_images_and_text(new_image_path, question, self.client)
137
-
138
- if add_note:
139
- output = (
140
- f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
141
- )
142
-
143
- return output
144
-
145
-
146
- @tool
147
- def visualizer(image_path: str, question: Optional[str] = None) -> str:
148
- """A tool that can answer questions about attached images.
149
-
150
- Args:
151
- image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
152
- question: The question to answer.
153
- """
154
-
155
- add_note = False
156
- if not question:
157
- add_note = True
158
- question = "Please write a detailed caption for this image."
159
- if not isinstance(image_path, str):
160
- raise Exception("You should provide at least `image_path` string argument to this tool!")
161
-
162
- mime_type, _ = mimetypes.guess_type(image_path)
163
- base64_image = encode_image(image_path)
164
-
165
- payload = {
166
- "model": "gpt-4o",
167
- "messages": [
168
- {
169
- "role": "user",
170
- "content": [
171
- {"type": "text", "text": question},
172
- {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
173
- ],
174
- }
175
- ],
176
- "max_tokens": 1000,
177
- }
178
- response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
179
- try:
180
- output = response.json()["choices"][0]["message"]["content"]
181
- except Exception:
182
- raise Exception(f"Response format unexpected: {response.json()}")
183
-
184
- if add_note:
185
- output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
186
-
187
- return output
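
Finally, a sketch of calling the @tool-decorated visualizer defined above. It posts to the OpenAI chat completions endpoint, so OPENAI_API_KEY must be set, and importing the module also pulls the idefics2 processor from the Hub; the image path and question are placeholders:

    from scripts.visual_qa import visualizer

    # Without a question, the tool returns a detailed caption of the image.
    print(visualizer(image_path="downloads/chart.png"))

    # With a question, it answers about the image content.
    print(visualizer(
        image_path="downloads/chart.png",
        question="Which bar is the tallest, and what is its value?",
    ))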