commited on
Browse files- .gitattributes +2 -0
- AbstractGenerator.ipynb +434 -0
- AbstractGenerator/TrainigData/en.txt +3 -0
- AbstractGenerator/TrainigData/es.txt +3 -0
- AbstractGenerator/weights/run1/encoder.json +3 -0
- AbstractGenerator/weights/run1/events.out.tfevents.1648184225.FRANZ96521-W11 +3 -0
- AbstractGenerator/weights/run1/events.out.tfevents.1648184499.FRANZ96521-W11 +3 -0
- AbstractGenerator/weights/run1/events.out.tfevents.1648229481.FRANZ96521-W11 +3 -0
- AbstractGenerator/weights/run1/hparams.json +3 -0
- AbstractGenerator/weights/run1/vocab.bpe +3 -0
- Descarga.ipynb +278 -0
- PDF_a_TXT.ipynb +105 -0
- models/124M/checkpoint +3 -0
- models/124M/encoder.json +3 -0
- models/124M/hparams.json +3 -0
- models/124M/ +3 -0
- models/124M/model.ckpt.index +3 -0
- models/124M/model.ckpt.meta +3 -0
- models/124M/vocab.bpe +3 -0
- txt_to_csv.ipynb +662 -0
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
28 |
models/** filter=lfs diff=lfs merge=lfs -text
29 |
AbstractGenerator/** filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,434 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 1,
6 |
"metadata": {},
7 |
"outputs": [
8 |
9 |
"name": "stdout",
10 |
"output_type": "stream",
11 |
"text": [
12 |
"WARNING:tensorflow:From C:\\Users\\franz\\AppData\\Local\\Temp\\ipykernel_14092\\ is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.\n",
13 |
"Instructions for updating:\n",
14 |
"Use `tf.config.list_physical_devices('GPU')` instead.\n",
15 |
"GPU is available\n"
16 |
17 |
18 |
19 |
"source": [
20 |
21 |
"import gpt_2_simple as gpt2\n",
22 |
"import os\n",
23 |
"import tensorflow as tf\n",
24 |
"import pandas as pd\n",
25 |
"import re\n",
26 |
"print(\"GPU is\", \"available\" if tf.test.is_gpu_available() else \"NOT AVAILABLE\")"
27 |
28 |
29 |
30 |
"cell_type": "code",
31 |
"execution_count": 2,
32 |
"metadata": {},
33 |
"outputs": [],
34 |
"source": [
35 |
"model_name = \"124M\"\n",
36 |
"if not os.path.isdir(os.path.join(\"models\", model_name)):\n",
37 |
"\tprint(f\"Downloading {model_name} model...\")\n",
38 |
"\tgpt2.download_gpt2(model_name=model_name) "
39 |
40 |
41 |
42 |
"cell_type": "code",
43 |
"execution_count": 3,
44 |
"metadata": {},
45 |
"outputs": [],
46 |
"source": [
47 |
"path = 'AbstractGenerator/'\n",
48 |
"checkpoint_dir =path+'weights/'\n",
49 |
"data_path = path+'TrainigData/'\n",
50 |
51 |
52 |
53 |
"file_name_en = 'en'\n",
54 |
"file_path_en = data_path+file_name_en\n",
55 |
56 |
"file_name_es = 'es'\n",
57 |
"file_path_es = data_path+file_name_es\n",
58 |
59 |
60 |
"prefix= '<|startoftext|>'\n",
61 |
"sufix ='<|endoftext|>'"
62 |
63 |
64 |
65 |
"cell_type": "markdown",
66 |
"metadata": {},
67 |
"source": [
68 |
"# create training data"
69 |
70 |
71 |
72 |
"cell_type": "code",
73 |
"execution_count": 13,
74 |
"metadata": {},
75 |
"outputs": [],
76 |
"source": [
77 |
"en = pd.read_csv('CSV\\scientific_paper_en.csv')[0:1000]\n",
78 |
"es = pd.read_csv('CSV\\scientific_paper_es.csv')[0:1000]"
79 |
80 |
81 |
82 |
"cell_type": "code",
83 |
"execution_count": 14,
84 |
"metadata": {},
85 |
"outputs": [],
86 |
"source": [
87 |
"import codecs\n",
88 |
"def createTrainingData(ds,fileName= 'resumen.txt' ,path ='TrainigData/'):\n",
89 |
" with,'a','utf-8') as f:\n",
90 |
" for i in ds.index:\n",
91 |
" f.write(prefix+\"\\n\")\n",
92 |
" f.write(ds.iloc[i]['text_no_abstract'])\n",
93 |
" f.write(\"ABSTRACT\\n\")\n",
94 |
" f.write(ds.iloc[i]['abstract']+\"\\n\")\n",
95 |
" f.write(sufix)\n",
96 |
" "
97 |
98 |
99 |
100 |
"cell_type": "code",
101 |
"execution_count": 15,
102 |
"metadata": {},
103 |
"outputs": [],
104 |
"source": [
105 |
106 |
107 |
108 |
109 |
110 |
"cell_type": "markdown",
111 |
"metadata": {},
112 |
"source": [
113 |
"# pretrained"
114 |
115 |
116 |
117 |
"cell_type": "code",
118 |
"execution_count": null,
119 |
"metadata": {},
120 |
"outputs": [],
121 |
"source": [
122 |
"sess = gpt2.start_tf_sess()\n",
123 |
124 |
125 |
126 |
127 |
"cell_type": "markdown",
128 |
"metadata": {},
129 |
"source": [
130 |
"# train "
131 |
132 |
133 |
134 |
"cell_type": "code",
135 |
"execution_count": 16,
136 |
"metadata": {},
137 |
"outputs": [],
138 |
"source": [
139 |
140 |
"sess = gpt2.start_tf_sess()"
141 |
142 |
143 |
144 |
"cell_type": "markdown",
145 |
"metadata": {},
146 |
"source": [
147 |
"## en"
148 |
149 |
150 |
151 |
"cell_type": "code",
152 |
"execution_count": null,
153 |
"metadata": {},
154 |
"outputs": [],
155 |
"source": [
156 |
157 |
" file_path_en+'.txt',\n",
158 |
" model_name=model_name,\n",
159 |
" checkpoint_dir=checkpoint_dir, \n",
160 |
" steps=1000\n",
161 |
" ) "
162 |
163 |
164 |
165 |
"cell_type": "markdown",
166 |
"metadata": {},
167 |
"source": [
168 |
"## es"
169 |
170 |
171 |
172 |
"cell_type": "code",
173 |
"execution_count": 17,
174 |
"metadata": {},
175 |
"outputs": [
176 |
177 |
"name": "stdout",
178 |
"output_type": "stream",
179 |
"text": [
180 |
"Loading checkpoint models\\124M\\model.ckpt\n",
181 |
"INFO:tensorflow:Restoring parameters from models\\124M\\model.ckpt\n",
182 |
"Loading dataset...\n"
183 |
184 |
185 |
186 |
"name": "stderr",
187 |
"output_type": "stream",
188 |
"text": [
189 |
"100%|██████████| 1/1 [00:51<00:00, 51.03s/it]\n"
190 |
191 |
192 |
193 |
"name": "stdout",
194 |
"output_type": "stream",
195 |
"text": [
196 |
"dataset has 17511492 tokens\n",
197 |
198 |
199 |
200 |
201 |
"ename": "ResourceExhaustedError",
202 |
"evalue": "Graph execution error:\n\nfailed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.",
203 |
"output_type": "error",
204 |
"traceback": [
205 |
206 |
"\u001b[1;31mResourceExhaustedError\u001b[0m Traceback (most recent call last)",
207 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;36mBaseSession._do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1376</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1377</a>\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1378</a>\u001b[0m \u001b[39mexcept\u001b[39;00m errors\u001b[39m.\u001b[39mOpError \u001b[39mas\u001b[39;00m e:\n",
208 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;36mBaseSession._do_run.<locals>._run_fn\u001b[1;34m(feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1359</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_extend_graph()\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1360</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_call_tf_sessionrun(options, feed_dict, fetch_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1361</a>\u001b[0m target_list, run_metadata)\n",
209 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;36mBaseSession._call_tf_sessionrun\u001b[1;34m(self, options, feed_dict, fetch_list, target_list, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1451</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_call_tf_sessionrun\u001b[39m(\u001b[39mself\u001b[39m, options, feed_dict, fetch_list, target_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1452</a>\u001b[0m run_metadata):\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1453</a>\u001b[0m \u001b[39mreturn\u001b[39;00m tf_session\u001b[39m.\u001b[39;49mTF_SessionRun_wrapper(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_session, options, feed_dict,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1454</a>\u001b[0m fetch_list, target_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1455</a>\u001b[0m run_metadata)\n",
210 |
"\u001b[1;31mResourceExhaustedError\u001b[0m: failed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.\n",
211 |
"\nDuring handling of the above exception, another exception occurred:\n",
212 |
"\u001b[1;31mResourceExhaustedError\u001b[0m Traceback (most recent call last)",
213 |
"\u001b[1;32mc:\\Users\\franz\\OneDrive\\Documentos\\GitHub\\Generador-de-abstracts\\AbstractGenerator.ipynb Cell 15'\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=0'>1</a>\u001b[0m gpt2\u001b[39m.\u001b[39;49mfinetune(sess,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=1'>2</a>\u001b[0m file_path_es\u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m.txt\u001b[39;49m\u001b[39m'\u001b[39;49m,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=2'>3</a>\u001b[0m model_name\u001b[39m=\u001b[39;49mmodel_name,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=3'>4</a>\u001b[0m checkpoint_dir\u001b[39m=\u001b[39;49mcheckpoint_dir, \n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=4'>5</a>\u001b[0m steps\u001b[39m=\u001b[39;49m\u001b[39m1000\u001b[39;49m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=5'>6</a>\u001b[0m )\n",
214 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\gpt_2_simple\\\u001b[0m, in \u001b[0;36mfinetune\u001b[1;34m(sess, dataset, steps, model_name, model_dir, combine, batch_size, learning_rate, accumulate_gradients, restore_from, run_name, checkpoint_dir, sample_every, sample_length, sample_num, multi_gpu, save_every, print_every, max_checkpoints, use_memory_saving_gradients, only_train_transformer_layers, optimizer, overwrite, reuse)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/'>337</a>\u001b[0m sess\u001b[39m.\u001b[39mrun(opt_reset)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/'>338</a>\u001b[0m \u001b[39mfor\u001b[39;00m _ \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(accumulate_gradients):\n\u001b[1;32m--> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/'>339</a>\u001b[0m sess\u001b[39m.\u001b[39;49mrun(\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/'>340</a>\u001b[0m opt_compute, feed_dict\u001b[39m=\u001b[39;49m{context: sample_batch()})\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/'>341</a>\u001b[0m (v_loss, v_summary) \u001b[39m=\u001b[39m sess\u001b[39m.\u001b[39mrun((opt_apply, summary_loss))\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/'>342</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n",
215 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;\u001b[1;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>964</a>\u001b[0m run_metadata_ptr \u001b[39m=\u001b[39m tf_session\u001b[39m.\u001b[39mTF_NewBuffer() \u001b[39mif\u001b[39;00m run_metadata \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>966</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>967</a>\u001b[0m result \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run(\u001b[39mNone\u001b[39;49;00m, fetches, feed_dict, options_ptr,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>968</a>\u001b[0m run_metadata_ptr)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>969</a>\u001b[0m \u001b[39mif\u001b[39;00m run_metadata:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>970</a>\u001b[0m proto_data \u001b[39m=\u001b[39m tf_session\u001b[39m.\u001b[39mTF_GetBuffer(run_metadata_ptr)\n",
216 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;36mBaseSession._run\u001b[1;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1187</a>\u001b[0m \u001b[39m# We only want to really perform the run if fetches or targets are provided,\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1188</a>\u001b[0m \u001b[39m# or if the call is a partial run that specifies feeds.\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1189</a>\u001b[0m \u001b[39mif\u001b[39;00m final_fetches \u001b[39mor\u001b[39;00m final_targets \u001b[39mor\u001b[39;00m (handle \u001b[39mand\u001b[39;00m feed_dict_tensor):\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1190</a>\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_do_run(handle, final_targets, final_fetches,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1191</a>\u001b[0m feed_dict_tensor, options, run_metadata)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1192</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1193</a>\u001b[0m results \u001b[39m=\u001b[39m []\n",
217 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;36mBaseSession._do_run\u001b[1;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1367</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_call_tf_sessionprun(handle, feed_dict, fetch_list)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1369</a>\u001b[0m \u001b[39mif\u001b[39;00m handle \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1370</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_do_call(_run_fn, feeds, fetches, targets, options,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1371</a>\u001b[0m run_metadata)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1372</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1373</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_do_call(_prun_fn, handle, feeds, fetches)\n",
218 |
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\\u001b[0m, in \u001b[0;36mBaseSession._do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1391</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m'\u001b[39m\u001b[39monly supports NHWC tensor format\u001b[39m\u001b[39m'\u001b[39m \u001b[39min\u001b[39;00m message:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1392</a>\u001b[0m message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mA possible workaround: Try disabling Grappler optimizer\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1393</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mby modifying the config for creating the session eg.\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1394</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39msession_config.graph_options.rewrite_options.\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1395</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mdisable_meta_optimizer = True\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/'>1396</a>\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mtype\u001b[39m(e)(node_def, op, message)\n",
219 |
"\u001b[1;31mResourceExhaustedError\u001b[0m: Graph execution error:\n\nfailed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode."
220 |
221 |
222 |
223 |
"source": [
224 |
225 |
" file_path_es+'.txt',\n",
226 |
" model_name=model_name,\n",
227 |
" checkpoint_dir=checkpoint_dir, \n",
228 |
" steps=1000\n",
229 |
" ) "
230 |
231 |
232 |
233 |
"cell_type": "markdown",
234 |
"metadata": {},
235 |
"source": [
236 |
"# test"
237 |
238 |
239 |
240 |
"cell_type": "markdown",
241 |
"metadata": {},
242 |
"source": [
243 |
"## en "
244 |
245 |
246 |
247 |
"cell_type": "code",
248 |
"execution_count": null,
249 |
"metadata": {},
250 |
"outputs": [],
251 |
"source": [
252 |
"text = \"\"\"Introduction and preliminaries\n",
253 |
"The focus of this paper is decompositions of (k, `)-sparse graphs into edge-disjoint subgraphs\n",
254 |
"that certify sparsity. We use graph to mean a multigraph, possibly with loops. We say that a\n",
255 |
"graph is (k, `)-sparse if no subset of n′ vertices spans more than kn′− ` edges in the graph; a\n",
256 |
"(k, `)-sparse graph with kn′− ` edges is (k, `)-tight. We call the range k ≤ `≤ 2k−1 the upper\n",
257 |
"range of sparse graphs and 0≤ `≤ k the lower range.\n",
258 |
"In this paper, we present efficient algorithms for finding decompositions that certify sparsity\n",
259 |
"in the upper range of `. Our algorithms also apply in the lower range, which was already ad-\n",
260 |
"dressed by [3, 4, 5, 6, 19]. A decomposition certifies the sparsity of a graph if the sparse graphs\n",
261 |
"and graphs admitting the decomposition coincide.\n",
262 |
"Our algorithms are based on a new characterization of sparse graphs, which we call the\n",
263 |
"pebble game with colors. The pebble game with colors is a simple graph construction rule that\n",
264 |
"produces a sparse graph along with a sparsity-certifying decomposition.\n",
265 |
"We define and study a canonical class of pebble game constructions, which correspond to\n",
266 |
"previously studied decompositions of sparse graphs into edge disjoint trees. Our results provide\n",
267 |
"a unifying framework for all the previously known special cases, including Nash-Williams-\n",
268 |
"Tutte and [7, 24]. Indeed, in the lower range, canonical pebble game constructions capture the\n",
269 |
"properties of the augmenting paths used in matroid union and intersection algorithms[5, 6].\n",
270 |
"Since the sparse graphs in the upper range are not known to be unions or intersections of the\n",
271 |
"matroids for which there are efficient augmenting path algorithms, these do not easily apply in\n",
272 |
"∗ Research of both authors funded by the NSF under grants NSF CCF-0430990 and NSF-DARPA CARGO\n",
273 |
"CCR-0310661 to the first author.\n",
274 |
"2 Ileana Streinu, Louis Theran\n",
275 |
"Term Meaning\n",
276 |
"Sparse graph G Every non-empty subgraph on n′ vertices has ≤ kn′− ` edges\n",
277 |
"Tight graph G G = (V,E) is sparse and |V |= n, |E|= kn− `\n",
278 |
"Block H in G G is sparse, and H is a tight subgraph\n",
279 |
"Component H of G G is sparse and H is a maximal block\n",
280 |
"Map-graph Graph that admits an out-degree-exactly-one orientation\n",
281 |
"(k, `)-maps-and-trees Edge-disjoint union of ` trees and (k− `) map-grpahs\n",
282 |
"`Tk Union of ` trees, each vertex is in exactly k of them\n",
283 |
"Set of tree-pieces of an `Tk induced on V ′ ⊂V Pieces of trees in the `Tk spanned by E(V ′)\n",
284 |
"Proper `Tk Every V ′ ⊂V contains ≥ ` pieces of trees from the `Tk\n",
285 |
"Table 1. Sparse graph and decomposition terminology used in this paper.\n",
286 |
"the upper range. Pebble game with colors constructions may thus be considered a strengthening\n",
287 |
"of augmenting paths to the upper range of matroidal sparse graphs.\n",
288 |
"1.1. Sparse graphs\n",
289 |
290 |
291 |
292 |
293 |
294 |
295 |
"cell_type": "code",
296 |
"execution_count": null,
297 |
"metadata": {},
298 |
"outputs": [],
299 |
"source": [
300 |
301 |
302 |
303 |
304 |
"cell_type": "markdown",
305 |
"metadata": {},
306 |
"source": [
307 |
"## es"
308 |
309 |
310 |
311 |
"cell_type": "code",
312 |
"execution_count": null,
313 |
"metadata": {},
314 |
"outputs": [],
315 |
"source": [
316 |
"text = \"\"\"El foco de este documento son las descomposicións de (k, `)-sparse gráficos en bordes-disjunto subgraphs\n",
317 |
"que certifique la escasez. Usamos el gráfico para significar un múltiplo, posiblemente con bucles. Nosotros decimos que un\n",
318 |
"grafo es (k, `)-sparse si ningún subconjunto de n′ vértices abarca más de kn ` bordes en el gráfico; a\n",
319 |
"(k, `)-sparse gráfico con kn ` bordes es (k, `)-estrechado. Llamamos al rango k ≤ 2k−1 el superior\n",
320 |
"rango de gráficos escasos y 0≤ k el rango inferior.\n",
321 |
"En este artículo, presentamos algoritmos eficientes para encontrar descomposicións que certifiquen la escasez\n",
322 |
"en el rango superior de `. Nuestros algoritmos también se aplican en el rango inferior, que ya era ad-\n",
323 |
"vestido por [3, 4, 5, 6, 19]. Una descomposición certifica la escasez de un gráfico si los gráficos dispersos\n",
324 |
"y los gráficos que admiten la descomposición coinciden.\n",
325 |
"Nuestros algoritmos se basan en una nueva caracterización de gráficos escasos, que llamamos el\n",
326 |
"juego de guijarros con colores. El juego de guijarros con colores es una regla de construcción de gráficos simples que\n",
327 |
"produce un gráfico escaso junto con una descomposición certificadora de la escasez.\n",
328 |
"Definimos y estudiamos una clase canónica de construcciones de juego de guijarros, que corresponden a\n",
329 |
"previamente estudiado las descomposiciones de los gráficos escasos en los árboles disjuntos del borde. Nuestros resultados proporcionan\n",
330 |
"un marco unificador para todos los casos especiales conocidos anteriormente, incluidos Nash-Williams-\n",
331 |
"Tutte y [7, 24]. De hecho, en el rango inferior, las construcciones canónicas de juego de guijarros capturan la\n",
332 |
"propiedades de las rutas de aumento utilizadas en los algoritmos de unión de matroides y de intersección[5, 6].\n",
333 |
"Dado que los gráficos escasos en el rango superior no se sabe que son uniones o intersecciones de la\n",
334 |
"matroides para los que hay algoritmos de ruta de aumento eficiente, estos no se aplican fácilmente en\n",
335 |
"* Investigación de ambos autores financiada por la NSF bajo subvenciones NSF CCF-0430990 y NSF-DARPA CARGO\n",
336 |
"CCR-0310661 al primer autor.\n",
337 |
"2 Ileana Streinu, Louis Theran\n",
338 |
"Significado del término\n",
339 |
"Gráfico escaso G Cada subgrafo no vacío en n′ vértices tiene ≤ kn ` bordes\n",
340 |
"El gráfico ajustado G G = (V,E) es escaso y V = n, E= kn− `\n",
341 |
"El bloque H en G G es escaso, y H es un subgrafo apretado\n",
342 |
"El componente H de G G es escaso y H es un bloqueo máximo\n",
343 |
"Gráfico cartográfico que admite una orientación de grado-exactamente-uno\n",
344 |
"(k, `)-maps-and-trees Edge-disjunt union de ` árboles y (k- `) map-grpahs\n",
345 |
"`Tk Unión de ` árboles, cada vértice está exactamente en k de ellos\n",
346 |
"Conjunto de piezas arbóreas de un `Tk inducido en V ′ ́V Piezas de árboles en el `Tk extendido por E(V ′)\n",
347 |
"`Tk Apropiado Cada V ′ V contiene ≥ ` pedazos de árboles de la `Tk\n",
348 |
"Cuadro 1 Gráfico escaso y terminología de descomposición utilizada en este artículo.\n",
349 |
"el rango superior. Pebble juego con construcciones de colores por lo tanto puede ser considerado un fortalecimiento\n",
350 |
"de caminos de aumento a la gama superior de gráficos de la escasez matroidal.\n",
351 |
"1.1. Gráficos escasos\n",
352 |
"Un gráfico es (k, `)-sparse si para cualquier subgrafo no vacío con bordes m′ y n′ vértices, m′ ≤\n",
353 |
"kn `. Observamos que esta condición implica que 0 ≤ ` ≤ 2k− 1, y a partir de ahora en este\n",
354 |
"Haremos esta suposición. Un gráfico escaso que tiene n vértices y exactamente bordes kn\n",
355 |
"se llama apretado.\n",
356 |
"Para un gráfico G = (V,E), y V ′ V, utilizamos el intervalo de notación (V ′) para el número de bordes\n",
357 |
"en el subgráfico inducido por V ′. En un gráfico dirigido, out(V ′) es el número de bordes con la cola\n",
358 |
"en V ′ y la cabeza en V −V ′; para un subgráfico inducido por V ′, llamamos a tal borde un borde superior.\n",
359 |
"Hay dos tipos importantes de subgrafías de gráficos escasos. Un bloque es un subgrafo apretado de\n",
360 |
"un gráfico escaso. Un componente es un bloque máximo.\n",
361 |
"La Tabla 1 resume la escasa terminología gráfica utilizada en este artículo.\n",
362 |
"1.2. Descomposiciónes de certificación de la sparsidad\n",
363 |
"Un k-arborescencia es un gráfico que admite una descomposición en k borde-desjunto que abarca los árboles.\n",
364 |
"La Figura 1(a) muestra un ejemplo de una 3-arborescencia. Se describen los gráficos k-arborescentes\n",
365 |
"por los conocidos teoremas de Tutte [23] y Nash-Williams [17] como exactamente el (k,k) apretado\n",
366 |
367 |
368 |
369 |
370 |
371 |
372 |
"cell_type": "code",
373 |
"execution_count": null,
374 |
"metadata": {},
375 |
"outputs": [],
376 |
"source": [
377 |
378 |
379 |
380 |
381 |
"cell_type": "markdown",
382 |
"metadata": {},
383 |
"source": [
384 |
"# gradio interface"
385 |
386 |
387 |
388 |
"cell_type": "code",
389 |
"execution_count": null,
390 |
"metadata": {},
391 |
"outputs": [],
392 |
"source": [
393 |
"def generateAbstract(text):\n",
394 |
" # with tf.compat.v1.variable_scope(\"weight\", reuse = True):\n",
395 |
" #sess = tf.compat.v1.get_variable('sess',gpt2.start_tf_sess())\n",
396 |
" tf.compat.v1.reset_default_graph()\n",
397 |
" sess = gpt2.start_tf_sess()\n",
398 |
" gpt2.load_gpt2(sess,checkpoint_dir=checkpoint_dir,run_name='run1')\n",
399 |
" txt = gpt2.generate(sess,prefix=str(text)+\"\\nABSTRACT\", return_as_list=True,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)[0]\n",
400 |
" return str(txt[txt.find('ABSTRACT'):])\n",
401 |
402 |
403 |
404 |
"iface = gr.Interface(fn=generateAbstract, inputs=gr.inputs.Textbox(lines=10, placeholder=\"text\"), outputs=\"textbox\")\n",
405 |
"iface.launch(debug = True )"
406 |
407 |
408 |
409 |
"metadata": {
410 |
"interpreter": {
411 |
"hash": "53fbdc69e3e12c371950068c144423682c30d04ec68c2bd46937202e33e0058d"
412 |
413 |
"kernelspec": {
414 |
"display_name": "Python 3.7.11 ('receta')",
415 |
"language": "python",
416 |
"name": "python3"
417 |
418 |
"language_info": {
419 |
"codemirror_mode": {
420 |
"name": "ipython",
421 |
"version": 3
422 |
423 |
"file_extension": ".py",
424 |
"mimetype": "text/x-python",
425 |
"name": "python",
426 |
"nbconvert_exporter": "python",
427 |
"pygments_lexer": "ipython3",
428 |
"version": "3.9.7"
429 |
430 |
"orig_nbformat": 4
431 |
432 |
"nbformat": 4,
433 |
"nbformat_minor": 2
434 |
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:814f983aa49ccc33a993a7d12f67a2eb2a7ca0b15d8697e82b50d3a19f3e1595
3 |
size 35400974
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:2454067cfe384e1d824b3f5d29cb5c4e1ff292289ad4b37c6cbd22f5cc715295
3 |
size 44460970
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
3 |
size 1042301
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:83a88ba7f3268f11289fb24fd13db1367b91acce6466c4ad394011e10ea4c304
3 |
size 82
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:cb8646e6bf1e1b8cc26f8128ec4e4c2e797dac297939450a8bf46057e7388a6a
3 |
size 82
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:04d1f71db542da83fee4fe8574bf382cb5324b6decef506206250b8fea85abd0
3 |
size 82
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:d9d56e4121c427164e0c55c6f03c08e1daf9002b9b672825112d19097b680318
3 |
size 90
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
3 |
size 456318
@@ -0,0 +1,278 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 2,
6 |
"metadata": {},
7 |
"outputs": [],
8 |
"source": [
9 |
"import pandas as pd\n",
10 |
"import json\n",
11 |
"from pandas import json_normalize\n",
12 |
"import requests\n",
13 |
"from pathlib import Path\n",
14 |
"from multiprocessing.pool import ThreadPool as Pool\n",
15 |
"import codecs\n",
16 |
"import random\n",
17 |
"import re"
18 |
19 |
20 |
21 |
"cell_type": "code",
22 |
"execution_count": 3,
23 |
"metadata": {},
24 |
"outputs": [],
25 |
"source": [
26 |
"URL_BASE = \"\"\n",
27 |
"PDF_PATH = 'PDF'\n",
28 |
29 |
30 |
31 |
32 |
"cell_type": "markdown",
33 |
"metadata": {},
34 |
"source": [
35 |
"# Arxiv\n"
36 |
37 |
38 |
39 |
"cell_type": "code",
40 |
"execution_count": 4,
41 |
"metadata": {},
42 |
"outputs": [
43 |
44 |
"name": "stdout",
45 |
"output_type": "stream",
46 |
"text": [
47 |
"<class 'pandas.core.frame.DataFrame'>\n"
48 |
49 |
50 |
51 |
"data": {
52 |
"text/html": [
53 |
54 |
"<style scoped>\n",
55 |
" .dataframe tbody tr th:only-of-type {\n",
56 |
" vertical-align: middle;\n",
57 |
" }\n",
58 |
59 |
" .dataframe tbody tr th {\n",
60 |
" vertical-align: top;\n",
61 |
" }\n",
62 |
63 |
" .dataframe thead th {\n",
64 |
" text-align: right;\n",
65 |
" }\n",
66 |
67 |
"<table border=\"1\" class=\"dataframe\">\n",
68 |
" <thead>\n",
69 |
" <tr style=\"text-align: right;\">\n",
70 |
" <th></th>\n",
71 |
" <th>id</th>\n",
72 |
" <th>title</th>\n",
73 |
" <th>abstract</th>\n",
74 |
" <th>Text</th>\n",
75 |
" </tr>\n",
76 |
" </thead>\n",
77 |
" <tbody>\n",
78 |
" <tr>\n",
79 |
" <th>0</th>\n",
80 |
" <td>0704.0001</td>\n",
81 |
" <td>Calculation of prompt diphoton production cros...</td>\n",
82 |
" <td>A fully differential calculation in perturba...</td>\n",
83 |
" <td></td>\n",
84 |
" </tr>\n",
85 |
" <tr>\n",
86 |
" <th>1</th>\n",
87 |
" <td>0704.0002</td>\n",
88 |
" <td>Sparsity-certifying Graph Decompositions</td>\n",
89 |
" <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
90 |
" <td></td>\n",
91 |
" </tr>\n",
92 |
" <tr>\n",
93 |
" <th>2</th>\n",
94 |
" <td>0704.0003</td>\n",
95 |
" <td>The evolution of the Earth-Moon system based o...</td>\n",
96 |
" <td>The evolution of Earth-Moon system is descri...</td>\n",
97 |
" <td></td>\n",
98 |
" </tr>\n",
99 |
" <tr>\n",
100 |
" <th>3</th>\n",
101 |
" <td>0704.0004</td>\n",
102 |
" <td>A determinant of Stirling cycle numbers counts...</td>\n",
103 |
" <td>We show that a determinant of Stirling cycle...</td>\n",
104 |
" <td></td>\n",
105 |
" </tr>\n",
106 |
" <tr>\n",
107 |
" <th>4</th>\n",
108 |
" <td>0704.0005</td>\n",
109 |
" <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
110 |
" <td>In this paper we show how to compute the $\\L...</td>\n",
111 |
" <td></td>\n",
112 |
" </tr>\n",
113 |
" <tr>\n",
114 |
" <th>...</th>\n",
115 |
" <td>...</td>\n",
116 |
" <td>...</td>\n",
117 |
" <td>...</td>\n",
118 |
" <td>...</td>\n",
119 |
" </tr>\n",
120 |
" <tr>\n",
121 |
" <th>1996</th>\n",
122 |
" <td>0704.1997</td>\n",
123 |
" <td>Query on Negative Temperature, Internal Intera...</td>\n",
124 |
" <td>After negative temperature is restated, we f...</td>\n",
125 |
" <td></td>\n",
126 |
" </tr>\n",
127 |
" <tr>\n",
128 |
" <th>1997</th>\n",
129 |
" <td>0704.1998</td>\n",
130 |
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
131 |
" <td>A scale invariant model containing dilaton $...</td>\n",
132 |
" <td></td>\n",
133 |
" </tr>\n",
134 |
" <tr>\n",
135 |
" <th>1998</th>\n",
136 |
" <td>0704.1999</td>\n",
137 |
" <td>Dark matter caustics and the enhancement of se...</td>\n",
138 |
" <td>Cold dark matter haloes are populated by cau...</td>\n",
139 |
" <td></td>\n",
140 |
" </tr>\n",
141 |
" <tr>\n",
142 |
" <th>1999</th>\n",
143 |
" <td>0704.2000</td>\n",
144 |
" <td>Search for a Higgs boson produced in associati...</td>\n",
145 |
" <td>We describe a search for the standard model ...</td>\n",
146 |
" <td></td>\n",
147 |
" </tr>\n",
148 |
" <tr>\n",
149 |
" <th>2000</th>\n",
150 |
" <td>0704.2001</td>\n",
151 |
" <td>Geometry of Parallelizable Manifolds in the Co...</td>\n",
152 |
" <td>In this paper, we deal with a generalization...</td>\n",
153 |
" <td></td>\n",
154 |
" </tr>\n",
155 |
" </tbody>\n",
156 |
157 |
"<p>2001 rows × 4 columns</p>\n",
158 |
159 |
160 |
"text/plain": [
161 |
" id title \\\n",
162 |
"0 0704.0001 Calculation of prompt diphoton production cros... \n",
163 |
"1 0704.0002 Sparsity-certifying Graph Decompositions \n",
164 |
"2 0704.0003 The evolution of the Earth-Moon system based o... \n",
165 |
"3 0704.0004 A determinant of Stirling cycle numbers counts... \n",
166 |
"4 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
167 |
"... ... ... \n",
168 |
"1996 0704.1997 Query on Negative Temperature, Internal Intera... \n",
169 |
"1997 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
170 |
"1998 0704.1999 Dark matter caustics and the enhancement of se... \n",
171 |
"1999 0704.2000 Search for a Higgs boson produced in associati... \n",
172 |
"2000 0704.2001 Geometry of Parallelizable Manifolds in the Co... \n",
173 |
174 |
" abstract Text \n",
175 |
"0 A fully differential calculation in perturba... \n",
176 |
"1 We describe a new algorithm, the $(k,\\ell)$-... \n",
177 |
"2 The evolution of Earth-Moon system is descri... \n",
178 |
"3 We show that a determinant of Stirling cycle... \n",
179 |
"4 In this paper we show how to compute the $\\L... \n",
180 |
"... ... ... \n",
181 |
"1996 After negative temperature is restated, we f... \n",
182 |
"1997 A scale invariant model containing dilaton $... \n",
183 |
"1998 Cold dark matter haloes are populated by cau... \n",
184 |
"1999 We describe a search for the standard model ... \n",
185 |
"2000 In this paper, we deal with a generalization... \n",
186 |
187 |
"[2001 rows x 4 columns]"
188 |
189 |
190 |
"execution_count": 4,
191 |
"metadata": {},
192 |
"output_type": "execute_result"
193 |
194 |
195 |
"source": [
196 |
"data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json',lines=True, chunksize=2001,dtype={'id':'str'})\n",
197 |
"df = None\n",
198 |
"for i in data:\n",
199 |
" df = i \n",
200 |
" print(type(i))\n",
201 |
" break\n",
202 |
"df = df[['id','title','abstract']]\n",
203 |
"df.insert(3, \"Text\", \"\") \n",
204 |
205 |
206 |
207 |
208 |
"cell_type": "code",
209 |
"execution_count": 7,
210 |
"metadata": {},
211 |
"outputs": [],
212 |
"source": [
213 |
"def GetFileURL(file_id):\n",
214 |
" url = URL_BASE+file_id\n",
215 |
" r = requests.get(url, stream=True) \n",
216 |
" filename = Path(PDF_PATH+'/'+file_id+'.pdf')\n",
217 |
" response = requests.get(url)\n",
218 |
" filename.write_bytes(response.content)"
219 |
220 |
221 |
222 |
"cell_type": "code",
223 |
"execution_count": 35,
224 |
"metadata": {},
225 |
"outputs": [],
226 |
"source": [
227 |
"pool_size = 16 \n",
228 |
"def worker(file):\n",
229 |
" try:\n",
230 |
" GetFileURL(file)\n",
231 |
" except:\n",
232 |
" print('error with item '+ file)\n",
233 |
" try:\n",
234 |
" with'/log.txt', 'a') as the_file: \n",
235 |
" the_file.writelines(str(file)+\"\\n\")\n",
236 |
" except:\n",
237 |
" print('error en log '+ file)\n",
238 |
"def get_ids(iteracion,batch=100): \n",
239 |
" inicio = int(iteracion*batch)\n",
240 |
" filesId = data[inicio :inicio + batch]['id']\n",
241 |
" return filesId\n",
242 |
243 |
"pool = Pool(pool_size)\n",
244 |
"filesId = get_ids(19)\n",
245 |
"for file in filesId:\n",
246 |
" pool.apply_async(worker, (file,))\n",
247 |
248 |
249 |
250 |
251 |
252 |
253 |
"metadata": {
254 |
"interpreter": {
255 |
"hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
256 |
257 |
"kernelspec": {
258 |
"display_name": "Python 3.9.7 ('tf-gpu')",
259 |
"language": "python",
260 |
"name": "python3"
261 |
262 |
"language_info": {
263 |
"codemirror_mode": {
264 |
"name": "ipython",
265 |
"version": 3
266 |
267 |
"file_extension": ".py",
268 |
"mimetype": "text/x-python",
269 |
"name": "python",
270 |
"nbconvert_exporter": "python",
271 |
"pygments_lexer": "ipython3",
272 |
"version": "3.9.7"
273 |
274 |
"orig_nbformat": 4
275 |
276 |
"nbformat": 4,
277 |
"nbformat_minor": 2
278 |
@@ -0,0 +1,105 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 3,
6 |
"metadata": {},
7 |
"outputs": [
8 |
9 |
"name": "stdout",
10 |
"output_type": "stream",
11 |
"text": [
12 |
"Requirement already satisfied: PyPDF2 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.26.0)\n",
13 |
"Requirement already satisfied: tika in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.24)\n",
14 |
"Requirement already satisfied: requests in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (2.27.1)\n",
15 |
"Requirement already satisfied: setuptools in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (58.0.4)\n",
16 |
"Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2.0.4)\n",
17 |
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2021.10.8)\n",
18 |
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (3.3)\n",
19 |
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (1.26.8)\n"
20 |
21 |
22 |
23 |
"source": [
24 |
"! pip install PyPDF2\n",
25 |
"! pip install tika"
26 |
27 |
28 |
29 |
"cell_type": "code",
30 |
"execution_count": 1,
31 |
"metadata": {},
32 |
"outputs": [],
33 |
"source": [
34 |
"from tika import parser\n",
35 |
"import codecs\n",
36 |
"import os"
37 |
38 |
39 |
40 |
"cell_type": "code",
41 |
"execution_count": 2,
42 |
"metadata": {},
43 |
"outputs": [],
44 |
"source": [
45 |
"def obtener_texto(file_path,store_path):\n",
46 |
" file_data = parser.from_file(file_path)\n",
47 |
" output = file_data['content']\n",
48 |
" output = output.strip() \n",
49 |
" output= output.split('\\n')\n",
50 |
" with'.txt', 'w','utf-8') as the_file: \n",
51 |
" for line in output:\n",
52 |
" #print(line)\n",
53 |
" if len(line)>4: \n",
54 |
" the_file.write(str(line)+'\\n')\n"
55 |
56 |
57 |
58 |
"cell_type": "code",
59 |
"execution_count": 3,
60 |
"metadata": {},
61 |
"outputs": [
62 |
63 |
"name": "stderr",
64 |
"output_type": "stream",
65 |
"text": [
66 |
"2022-03-17 17:02:20,018 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n"
67 |
68 |
69 |
70 |
"source": [
71 |
"PDF_PATH = 'PDF'\n",
72 |
"TXT_PATH= 'TXT'\n",
73 |
"files = os.listdir(PDF_PATH)\n",
74 |
"for file in files:\n",
75 |
" obtener_texto(PDF_PATH+'/'+file,TXT_PATH+'/'+file)\n",
76 |
" "
77 |
78 |
79 |
80 |
"metadata": {
81 |
"interpreter": {
82 |
"hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
83 |
84 |
"kernelspec": {
85 |
"display_name": "Python 3.9.7 ('tf-gpu')",
86 |
"language": "python",
87 |
"name": "python3"
88 |
89 |
"language_info": {
90 |
"codemirror_mode": {
91 |
"name": "ipython",
92 |
"version": 3
93 |
94 |
"file_extension": ".py",
95 |
"mimetype": "text/x-python",
96 |
"name": "python",
97 |
"nbconvert_exporter": "python",
98 |
"pygments_lexer": "ipython3",
99 |
"version": "3.9.7"
100 |
101 |
"orig_nbformat": 4
102 |
103 |
"nbformat": 4,
104 |
"nbformat_minor": 2
105 |
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:dd1b025d2e155283f5e300ce95bf6d5b6bc0f7fe010db73daa6975eb896ab9cb
3 |
size 77
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
3 |
size 1042301
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:d9d56e4121c427164e0c55c6f03c08e1daf9002b9b672825112d19097b680318
3 |
size 90
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:2060c885360cc0cf41d7a6dbc4d24b5127aae20260c8b5ae521b5a6578407118
3 |
size 497759232
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:71916f763f9746f9b2a06b12d91996cf1084ae008d0424543d39391c5f2dc687
3 |
size 5215
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:4668c448fa11531fd6700460487f73e82d3272960cea942252f8744bf225c77b
3 |
size 471155
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
3 |
size 456318
@@ -0,0 +1,662 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": null,
6 |
"metadata": {},
7 |
"outputs": [],
8 |
"source": [
9 |
"import pandas as pd\n",
10 |
"import os\n",
11 |
"from easynmt import EasyNMT\n"
12 |
13 |
14 |
15 |
"cell_type": "code",
16 |
"execution_count": null,
17 |
"metadata": {},
18 |
"outputs": [],
19 |
"source": [
20 |
"URL_BASE = \"\"\n",
21 |
"PDF_PATH = 'PDF'\n",
22 |
"TXT_PATH= 'TXT'\n",
23 |
24 |
25 |
26 |
27 |
"cell_type": "markdown",
28 |
"metadata": {},
29 |
"source": [
30 |
"# Get Data from TXT"
31 |
32 |
33 |
34 |
"cell_type": "code",
35 |
"execution_count": null,
36 |
"metadata": {},
37 |
"outputs": [],
38 |
"source": [
39 |
"data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json',lines=True, chunksize=2001,dtype={'id':'str'})\n",
40 |
"df = None\n",
41 |
"for i in data:\n",
42 |
" df = i \n",
43 |
" print(type(i))\n",
44 |
" break\n",
45 |
"df = df[['id','title','abstract']]\n"
46 |
47 |
48 |
49 |
"cell_type": "code",
50 |
"execution_count": null,
51 |
"metadata": {},
52 |
"outputs": [],
53 |
"source": [
54 |
"for file in df['id']:\n",
55 |
" file_path = TXT_PATH+'/'+str(file)+'.pdf.txt'\n",
56 |
" if os.path.isfile(file_path):\n",
57 |
" with open(file_path,'r',encoding='utf8') as f:\n",
58 |
" s =str( \n",
59 |
" df.loc[df['id'] == str(file),'full_text'] = s "
60 |
61 |
62 |
63 |
"cell_type": "code",
64 |
"execution_count": null,
65 |
"metadata": {},
66 |
"outputs": [],
67 |
"source": [
68 |
"df = df.dropna()\n",
69 |
70 |
71 |
72 |
73 |
74 |
75 |
"cell_type": "markdown",
76 |
"metadata": {},
77 |
"source": [
78 |
"# first run \n"
79 |
80 |
81 |
82 |
"cell_type": "code",
83 |
"execution_count": null,
84 |
"metadata": {},
85 |
"outputs": [],
86 |
"source": [
87 |
"df = pd.read_csv(CSV_PATH +'/scientific_paper_en.csv',dtype={'id':'str'})\n",
88 |
89 |
90 |
91 |
92 |
"cell_type": "markdown",
93 |
"metadata": {},
94 |
"source": [
95 |
"# leer datos"
96 |
97 |
98 |
99 |
"cell_type": "code",
100 |
"execution_count": null,
101 |
"metadata": {},
102 |
"outputs": [],
103 |
"source": [
104 |
"df = pd.read_csv(CSV_PATH +'/scientific_paper_full_text_translated.csv',dtype={'id':'str'})\n",
105 |
106 |
107 |
108 |
109 |
110 |
"cell_type": "markdown",
111 |
"metadata": {},
112 |
"source": [
113 |
"# translate"
114 |
115 |
116 |
117 |
"cell_type": "code",
118 |
"execution_count": null,
119 |
"metadata": {},
120 |
"outputs": [],
121 |
"source": [
122 |
"model = EasyNMT('opus-mt')"
123 |
124 |
125 |
126 |
"cell_type": "markdown",
127 |
"metadata": {},
128 |
"source": [
129 |
"## translate full text"
130 |
131 |
132 |
133 |
"cell_type": "code",
134 |
"execution_count": null,
135 |
"metadata": {},
136 |
"outputs": [],
137 |
"source": [
138 |
"max = len(df.index)\n",
139 |
"for i in range(0,1754):\n",
140 |
" text = df.iloc[i]['full_text']\n",
141 |
" translated_text = model.translate(text, target_lang='es')\n",
142 |
" df.loc[i,'translated'] = translated_text \n",
143 |
" print(\"listo documento \",i)\n",
144 |
" if(i%10==0):\n",
145 |
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
146 |
147 |
148 |
149 |
150 |
"cell_type": "markdown",
151 |
"metadata": {},
152 |
"source": [
153 |
"## translate abstract"
154 |
155 |
156 |
157 |
"cell_type": "code",
158 |
"execution_count": null,
159 |
"metadata": {},
160 |
"outputs": [],
161 |
"source": [
162 |
"max = len(df.index)\n",
163 |
"for i in range(0,1754):\n",
164 |
" text = df.iloc[i]['abstract']\n",
165 |
" translated_text = model.translate(text, target_lang='es')\n",
166 |
" df.loc[i,'translated_abstract'] = translated_text \n",
167 |
" print(\"listo documento \",i)\n",
168 |
" if(i%100==0):\n",
169 |
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
170 |
171 |
172 |
173 |
174 |
175 |
"cell_type": "markdown",
176 |
"metadata": {},
177 |
"source": [
178 |
"# remove abstract"
179 |
180 |
181 |
182 |
"cell_type": "code",
183 |
"execution_count": null,
184 |
"metadata": {},
185 |
"outputs": [],
186 |
"source": [
187 |
"max = len(df.index)-1"
188 |
189 |
190 |
191 |
"cell_type": "code",
192 |
"execution_count": null,
193 |
"metadata": {},
194 |
"outputs": [],
195 |
"source": [
196 |
"end = 'Introducción'\n",
197 |
"for i in range(0,max):\n",
198 |
" text = df.iloc[i]['translated'] \n",
199 |
" p = text.find(end)\n",
200 |
" if(p != -1): \n",
201 |
" df.loc[i,'translated_no_abstract'] = text[p:] \n",
202 |
" else:\n",
203 |
" df.loc[i,'translated_no_abstract']= text\n",
204 |
" print(\"listo documento \",i,p)\n",
205 |
" if(i%1000==0):\n",
206 |
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
207 |
208 |
209 |
210 |
211 |
"cell_type": "code",
212 |
"execution_count": null,
213 |
"metadata": {},
214 |
"outputs": [],
215 |
"source": [
216 |
"end = 'Abstract'\n",
217 |
"for i in range(0,max):\n",
218 |
" text = df.iloc[i]['full_text'] \n",
219 |
" p = text.find(end)\n",
220 |
" if(p != -1): \n",
221 |
" df.loc[i,'text_no_abstract'] = text[p:] \n",
222 |
" else:\n",
223 |
" df.loc[i,'text_no_abstract']= text \n",
224 |
" if(i%1000==0):\n",
225 |
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
226 |
227 |
228 |
229 |
230 |
"cell_type": "markdown",
231 |
"metadata": {},
232 |
"source": [
233 |
"# split data to csv"
234 |
235 |
236 |
237 |
"cell_type": "code",
238 |
"execution_count": null,
239 |
"metadata": {},
240 |
"outputs": [],
241 |
"source": [
242 |
"df = pd.read_csv(CSV_PATH +'/scientific_paper_full_text_translated.csv',dtype={'id':'str'})\n",
243 |
244 |
245 |
246 |
247 |
"cell_type": "code",
248 |
"execution_count": 77,
249 |
"metadata": {},
250 |
"outputs": [
251 |
252 |
"data": {
253 |
"text/html": [
254 |
255 |
"<style scoped>\n",
256 |
" .dataframe tbody tr th:only-of-type {\n",
257 |
" vertical-align: middle;\n",
258 |
" }\n",
259 |
260 |
" .dataframe tbody tr th {\n",
261 |
" vertical-align: top;\n",
262 |
" }\n",
263 |
264 |
" .dataframe thead th {\n",
265 |
" text-align: right;\n",
266 |
" }\n",
267 |
268 |
"<table border=\"1\" class=\"dataframe\">\n",
269 |
" <thead>\n",
270 |
" <tr style=\"text-align: right;\">\n",
271 |
" <th></th>\n",
272 |
" <th>id</th>\n",
273 |
" <th>title</th>\n",
274 |
" <th>full_text</th>\n",
275 |
" <th>abstract</th>\n",
276 |
" <th>text_no_abstract</th>\n",
277 |
" </tr>\n",
278 |
" </thead>\n",
279 |
" <tbody>\n",
280 |
" <tr>\n",
281 |
" <th>0</th>\n",
282 |
" <td>0704.0002</td>\n",
283 |
" <td>Sparsity-certifying Graph Decompositions</td>\n",
284 |
" <td>Descomposiciones del gráfico de certificación ...</td>\n",
285 |
" <td>Describimos un nuevo algoritmo, el juego de ...</td>\n",
286 |
" <td>Introducción y preliminares\\nEl foco de este d...</td>\n",
287 |
" </tr>\n",
288 |
" <tr>\n",
289 |
" <th>1</th>\n",
290 |
" <td>0704.0003</td>\n",
291 |
" <td>The evolution of the Earth-Moon system based o...</td>\n",
292 |
" <td>La evolución del sistema Tierra-Luna basado en...</td>\n",
293 |
" <td>La evolución del sistema Tierra-Luna es desc...</td>\n",
294 |
" <td>Introducción \\nLa teoría aceptada popularmente...</td>\n",
295 |
" </tr>\n",
296 |
" <tr>\n",
297 |
" <th>2</th>\n",
298 |
" <td>0704.0004</td>\n",
299 |
" <td>A determinant of Stirling cycle numbers counts...</td>\n",
300 |
" <td>Un determinante de los números de ciclo de Sti...</td>\n",
301 |
" <td>Demostramos que un determinante de los númer...</td>\n",
302 |
" <td>Introducción El propósito principal de este ar...</td>\n",
303 |
" </tr>\n",
304 |
" <tr>\n",
305 |
" <th>3</th>\n",
306 |
" <td>0704.0005</td>\n",
307 |
" <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
308 |
309 |
" <td>En este artículo mostramos cómo calcular la ...</td>\n",
310 |
311 |
" </tr>\n",
312 |
" <tr>\n",
313 |
" <th>4</th>\n",
314 |
" <td>0704.0007</td>\n",
315 |
" <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
316 |
" <td>La mecánica cuántica de polímeros y su límite ...</td>\n",
317 |
" <td>Una representación cuántica no estándar de l...</td>\n",
318 |
" <td>La mecánica cuántica de polímeros y su límite ...</td>\n",
319 |
" </tr>\n",
320 |
" <tr>\n",
321 |
" <th>...</th>\n",
322 |
" <td>...</td>\n",
323 |
" <td>...</td>\n",
324 |
" <td>...</td>\n",
325 |
" <td>...</td>\n",
326 |
" <td>...</td>\n",
327 |
" </tr>\n",
328 |
" <tr>\n",
329 |
" <th>1749</th>\n",
330 |
" <td>0704.1996</td>\n",
331 |
" <td>A Wave-function for Stringy Universes</td>\n",
332 |
" <td>LPTENS–07/16\\nAbril de 2007\\nUna función de on...</td>\n",
333 |
" <td>Definimos una función de onda para los fondo...</td>\n",
334 |
" <td>Introducción\\nNuestro objetivo en este documen...</td>\n",
335 |
" </tr>\n",
336 |
" <tr>\n",
337 |
" <th>1750</th>\n",
338 |
" <td>0704.1997</td>\n",
339 |
" <td>Query on Negative Temperature, Internal Intera...</td>\n",
340 |
" <td>Microsoft Word - negEntr.doc\\nConsulta sobre t...</td>\n",
341 |
" <td>Después de que la temperatura negativa se vu...</td>\n",
342 |
" <td>Microsoft Word - negEntr.doc\\nConsulta sobre t...</td>\n",
343 |
" </tr>\n",
344 |
" <tr>\n",
345 |
" <th>1751</th>\n",
346 |
" <td>0704.1998</td>\n",
347 |
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
348 |
" <td>Ausencia del problema de la quinta fuerza en u...</td>\n",
349 |
" <td>Un modelo de escala invariante que contiene ...</td>\n",
350 |
" <td>Introducción\\n\\tBase de Dos Medidas Teoría de ...</td>\n",
351 |
" </tr>\n",
352 |
" <tr>\n",
353 |
" <th>1752</th>\n",
354 |
" <td>0704.1999</td>\n",
355 |
" <td>Dark matter caustics and the enhancement of se...</td>\n",
356 |
" <td>Proyecto de versión 16 de noviembre de 2018\\nT...</td>\n",
357 |
" <td>Los haloes fríos de materia oscura están pob...</td>\n",
358 |
" <td>Proyecto de versión 16 de noviembre de 2018\\nT...</td>\n",
359 |
" </tr>\n",
360 |
" <tr>\n",
361 |
" <th>1753</th>\n",
362 |
" <td>0704.2000</td>\n",
363 |
" <td>Search for a Higgs boson produced in associati...</td>\n",
364 |
" <td>FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi...</td>\n",
365 |
" <td>Describimos una búsqueda para el modelo está...</td>\n",
366 |
" <td>FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi...</td>\n",
367 |
" </tr>\n",
368 |
" </tbody>\n",
369 |
370 |
"<p>1754 rows × 5 columns</p>\n",
371 |
372 |
373 |
"text/plain": [
374 |
" id title \\\n",
375 |
"0 0704.0002 Sparsity-certifying Graph Decompositions \n",
376 |
"1 0704.0003 The evolution of the Earth-Moon system based o... \n",
377 |
"2 0704.0004 A determinant of Stirling cycle numbers counts... \n",
378 |
"3 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
379 |
"4 0704.0007 Polymer Quantum Mechanics and its Continuum Limit \n",
380 |
"... ... ... \n",
381 |
"1749 0704.1996 A Wave-function for Stringy Universes \n",
382 |
"1750 0704.1997 Query on Negative Temperature, Internal Intera... \n",
383 |
"1751 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
384 |
"1752 0704.1999 Dark matter caustics and the enhancement of se... \n",
385 |
"1753 0704.2000 Search for a Higgs boson produced in associati... \n",
386 |
387 |
" full_text \\\n",
388 |
"0 Descomposiciones del gráfico de certificación ... \n",
389 |
"1 La evolución del sistema Tierra-Luna basado en... \n",
390 |
"2 Un determinante de los números de ciclo de Sti... \n",
391 |
392 |
"4 La mecánica cuántica de polímeros y su límite ... \n",
393 |
"... ... \n",
394 |
"1749 LPTENS–07/16\\nAbril de 2007\\nUna función de on... \n",
395 |
"1750 Microsoft Word - negEntr.doc\\nConsulta sobre t... \n",
396 |
"1751 Ausencia del problema de la quinta fuerza en u... \n",
397 |
"1752 Proyecto de versión 16 de noviembre de 2018\\nT... \n",
398 |
"1753 FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... \n",
399 |
400 |
" abstract \\\n",
401 |
"0 Describimos un nuevo algoritmo, el juego de ... \n",
402 |
"1 La evolución del sistema Tierra-Luna es desc... \n",
403 |
"2 Demostramos que un determinante de los númer... \n",
404 |
"3 En este artículo mostramos cómo calcular la ... \n",
405 |
"4 Una representación cuántica no estándar de l... \n",
406 |
"... ... \n",
407 |
"1749 Definimos una función de onda para los fondo... \n",
408 |
"1750 Después de que la temperatura negativa se vu... \n",
409 |
"1751 Un modelo de escala invariante que contiene ... \n",
410 |
"1752 Los haloes fríos de materia oscura están pob... \n",
411 |
"1753 Describimos una búsqueda para el modelo está... \n",
412 |
413 |
" text_no_abstract \n",
414 |
"0 Introducción y preliminares\\nEl foco de este d... \n",
415 |
"1 Introducción \\nLa teoría aceptada popularmente... \n",
416 |
"2 Introducción El propósito principal de este ar... \n",
417 |
418 |
"4 La mecánica cuántica de polímeros y su límite ... \n",
419 |
"... ... \n",
420 |
"1749 Introducción\\nNuestro objetivo en este documen... \n",
421 |
"1750 Microsoft Word - negEntr.doc\\nConsulta sobre t... \n",
422 |
"1751 Introducción\\n\\tBase de Dos Medidas Teoría de ... \n",
423 |
"1752 Proyecto de versión 16 de noviembre de 2018\\nT... \n",
424 |
"1753 FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... \n",
425 |
426 |
"[1754 rows x 5 columns]"
427 |
428 |
429 |
"execution_count": 77,
430 |
"metadata": {},
431 |
"output_type": "execute_result"
432 |
433 |
434 |
"source": [
435 |
"es = df[['id','title','translated','translated_abstract','translated_no_abstract']]\n",
436 |
"es.columns = [\"id\",\"title\", \"full_text\",\"abstract\",\"text_no_abstract\"]\n",
437 |
438 |
439 |
440 |
441 |
442 |
"cell_type": "code",
443 |
"execution_count": 79,
444 |
"metadata": {},
445 |
"outputs": [
446 |
447 |
"data": {
448 |
"text/html": [
449 |
450 |
"<style scoped>\n",
451 |
" .dataframe tbody tr th:only-of-type {\n",
452 |
" vertical-align: middle;\n",
453 |
" }\n",
454 |
455 |
" .dataframe tbody tr th {\n",
456 |
" vertical-align: top;\n",
457 |
" }\n",
458 |
459 |
" .dataframe thead th {\n",
460 |
" text-align: right;\n",
461 |
" }\n",
462 |
463 |
"<table border=\"1\" class=\"dataframe\">\n",
464 |
" <thead>\n",
465 |
" <tr style=\"text-align: right;\">\n",
466 |
" <th></th>\n",
467 |
" <th>id</th>\n",
468 |
" <th>title</th>\n",
469 |
" <th>full_text</th>\n",
470 |
" <th>abstract</th>\n",
471 |
" <th>text_no_abstract</th>\n",
472 |
" </tr>\n",
473 |
" </thead>\n",
474 |
" <tbody>\n",
475 |
" <tr>\n",
476 |
" <th>0</th>\n",
477 |
" <td>0704.0002</td>\n",
478 |
" <td>Sparsity-certifying Graph Decompositions</td>\n",
479 |
" <td>Sparsity-certifying Graph Decompositions\\nIlea...</td>\n",
480 |
" <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
481 |
" <td>Introduction and preliminaries\\nThe focus of t...</td>\n",
482 |
" </tr>\n",
483 |
" <tr>\n",
484 |
" <th>1</th>\n",
485 |
" <td>0704.0003</td>\n",
486 |
" <td>The evolution of the Earth-Moon system based o...</td>\n",
487 |
" <td>The evolution of the Earth-Moon system based o...</td>\n",
488 |
" <td>The evolution of Earth-Moon system is descri...</td>\n",
489 |
" <td>Introduction \\nThe popularly accepted theory f...</td>\n",
490 |
" </tr>\n",
491 |
" <tr>\n",
492 |
" <th>2</th>\n",
493 |
" <td>0704.0004</td>\n",
494 |
" <td>A determinant of Stirling cycle numbers counts...</td>\n",
495 |
" <td>A Determinant of Stirling Cycle Numbers Counts...</td>\n",
496 |
" <td>We show that a determinant of Stirling cycle...</td>\n",
497 |
" <td>Introduction The chief purpose of this paper i...</td>\n",
498 |
" </tr>\n",
499 |
" <tr>\n",
500 |
" <th>3</th>\n",
501 |
" <td>0704.0005</td>\n",
502 |
" <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
503 |
504 |
" <td>In this paper we show how to compute the $\\L...</td>\n",
505 |
506 |
" </tr>\n",
507 |
" <tr>\n",
508 |
" <th>4</th>\n",
509 |
" <td>0704.0007</td>\n",
510 |
" <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
511 |
" <td>Polymer Quantum Mechanics and its Continuum Li...</td>\n",
512 |
" <td>A rather non-standard quantum representation...</td>\n",
513 |
" <td>Polymer Quantum Mechanics and its Continuum Li...</td>\n",
514 |
" </tr>\n",
515 |
" <tr>\n",
516 |
" <th>...</th>\n",
517 |
" <td>...</td>\n",
518 |
" <td>...</td>\n",
519 |
" <td>...</td>\n",
520 |
" <td>...</td>\n",
521 |
" <td>...</td>\n",
522 |
" </tr>\n",
523 |
" <tr>\n",
524 |
" <th>1749</th>\n",
525 |
" <td>0704.1996</td>\n",
526 |
" <td>A Wave-function for Stringy Universes</td>\n",
527 |
" <td>LPTENS–07/16\\nApril 2007\\nA Wave-function for ...</td>\n",
528 |
" <td>We define a wave-function for string theory ...</td>\n",
529 |
" <td>Introduction\\nOur goal in this paper is to emb...</td>\n",
530 |
" </tr>\n",
531 |
" <tr>\n",
532 |
" <th>1750</th>\n",
533 |
" <td>0704.1997</td>\n",
534 |
" <td>Query on Negative Temperature, Internal Intera...</td>\n",
535 |
" <td>Microsoft Word - negEntr.doc\\nQuery on Negativ...</td>\n",
536 |
" <td>After negative temperature is restated, we f...</td>\n",
537 |
" <td>Microsoft Word - negEntr.doc\\nQuery on Negativ...</td>\n",
538 |
" </tr>\n",
539 |
" <tr>\n",
540 |
" <th>1751</th>\n",
541 |
" <td>0704.1998</td>\n",
542 |
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
543 |
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
544 |
" <td>A scale invariant model containing dilaton $...</td>\n",
545 |
" <td>Introduction\\n\\tBasis of Two Measures Field Th...</td>\n",
546 |
" </tr>\n",
547 |
" <tr>\n",
548 |
" <th>1752</th>\n",
549 |
" <td>0704.1999</td>\n",
550 |
" <td>Dark matter caustics and the enhancement of se...</td>\n",
551 |
" <td>Draft version November 16, 2018\\nPreprint type...</td>\n",
552 |
" <td>Cold dark matter haloes are populated by cau...</td>\n",
553 |
" <td>Draft version November 16, 2018\\nPreprint type...</td>\n",
554 |
" </tr>\n",
555 |
" <tr>\n",
556 |
" <th>1753</th>\n",
557 |
" <td>0704.2000</td>\n",
558 |
" <td>Search for a Higgs boson produced in associati...</td>\n",
559 |
" <td>FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso...</td>\n",
560 |
" <td>We describe a search for the standard model ...</td>\n",
561 |
" <td>FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso...</td>\n",
562 |
" </tr>\n",
563 |
" </tbody>\n",
564 |
565 |
"<p>1754 rows × 5 columns</p>\n",
566 |
567 |
568 |
"text/plain": [
569 |
" id title \\\n",
570 |
"0 0704.0002 Sparsity-certifying Graph Decompositions \n",
571 |
"1 0704.0003 The evolution of the Earth-Moon system based o... \n",
572 |
"2 0704.0004 A determinant of Stirling cycle numbers counts... \n",
573 |
"3 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
574 |
"4 0704.0007 Polymer Quantum Mechanics and its Continuum Limit \n",
575 |
"... ... ... \n",
576 |
"1749 0704.1996 A Wave-function for Stringy Universes \n",
577 |
"1750 0704.1997 Query on Negative Temperature, Internal Intera... \n",
578 |
"1751 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
579 |
"1752 0704.1999 Dark matter caustics and the enhancement of se... \n",
580 |
"1753 0704.2000 Search for a Higgs boson produced in associati... \n",
581 |
582 |
" full_text \\\n",
583 |
"0 Sparsity-certifying Graph Decompositions\\nIlea... \n",
584 |
"1 The evolution of the Earth-Moon system based o... \n",
585 |
"2 A Determinant of Stirling Cycle Numbers Counts... \n",
586 |
587 |
"4 Polymer Quantum Mechanics and its Continuum Li... \n",
588 |
"... ... \n",
589 |
"1749 LPTENS–07/16\\nApril 2007\\nA Wave-function for ... \n",
590 |
"1750 Microsoft Word - negEntr.doc\\nQuery on Negativ... \n",
591 |
"1751 Absence of the Fifth Force Problem in a Model ... \n",
592 |
"1752 Draft version November 16, 2018\\nPreprint type... \n",
593 |
"1753 FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... \n",
594 |
595 |
" abstract \\\n",
596 |
"0 We describe a new algorithm, the $(k,\\ell)$-... \n",
597 |
"1 The evolution of Earth-Moon system is descri... \n",
598 |
"2 We show that a determinant of Stirling cycle... \n",
599 |
"3 In this paper we show how to compute the $\\L... \n",
600 |
"4 A rather non-standard quantum representation... \n",
601 |
"... ... \n",
602 |
"1749 We define a wave-function for string theory ... \n",
603 |
"1750 After negative temperature is restated, we f... \n",
604 |
"1751 A scale invariant model containing dilaton $... \n",
605 |
"1752 Cold dark matter haloes are populated by cau... \n",
606 |
"1753 We describe a search for the standard model ... \n",
607 |
608 |
" text_no_abstract \n",
609 |
"0 Introduction and preliminaries\\nThe focus of t... \n",
610 |
"1 Introduction \\nThe popularly accepted theory f... \n",
611 |
"2 Introduction The chief purpose of this paper i... \n",
612 |
613 |
"4 Polymer Quantum Mechanics and its Continuum Li... \n",
614 |
"... ... \n",
615 |
"1749 Introduction\\nOur goal in this paper is to emb... \n",
616 |
"1750 Microsoft Word - negEntr.doc\\nQuery on Negativ... \n",
617 |
"1751 Introduction\\n\\tBasis of Two Measures Field Th... \n",
618 |
"1752 Draft version November 16, 2018\\nPreprint type... \n",
619 |
"1753 FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... \n",
620 |
621 |
"[1754 rows x 5 columns]"
622 |
623 |
624 |
"execution_count": 79,
625 |
"metadata": {},
626 |
"output_type": "execute_result"
627 |
628 |
629 |
"source": [
630 |
"en = df[['id','title','full_text','abstract','text_no_abstract']]\n",
631 |
"en.columns = [\"id\",\"title\", \"full_text\",\"abstract\",\"text_no_abstract\"]\n",
632 |
633 |
634 |
635 |
636 |
637 |
"metadata": {
638 |
"interpreter": {
639 |
"hash": "05def4d9d0834781cbeb6b95fd92421f8bd6a45e945308f90d88567f4afc1911"
640 |
641 |
"kernelspec": {
642 |
"display_name": "Python 3.8.12 ('tensorflow')",
643 |
"language": "python",
644 |
"name": "python3"
645 |
646 |
"language_info": {
647 |
"codemirror_mode": {
648 |
"name": "ipython",
649 |
"version": 3
650 |
651 |
"file_extension": ".py",
652 |
"mimetype": "text/x-python",
653 |
"name": "python",
654 |
"nbconvert_exporter": "python",
655 |
"pygments_lexer": "ipython3",
656 |
"version": "3.9.7"
657 |
658 |
"orig_nbformat": 4
659 |
660 |
"nbformat": 4,
661 |
"nbformat_minor": 2
662 |