{ "cells": [
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import pandas as pd\n",
  "import os\n",
  "from easynmt import EasyNMT\n" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "URL_BASE = \"https://arxiv.org/pdf/\"\n",
  "PDF_PATH = 'PDF'\n",
  "TXT_PATH= 'TXT'\n",
  "CSV_PATH = 'CSV'\n",
  "\n",
  "# Locations and sizes used by several cells, defined once so they cannot drift apart.\n",
  "METADATA_JSON = 'ARxiv/arxiv-metadata-oai-snapshot.json'\n",
  "N_PAPERS = 2001  # number of metadata records read from the snapshot\n",
  "EN_CSV = CSV_PATH + '/scientific_paper_en.csv'\n",
  "TRANSLATED_CSV = CSV_PATH + '/scientific_paper_full_text_translated.csv'" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Get Data from TXT" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "data = pd.read_json(METADATA_JSON, lines=True, chunksize=N_PAPERS, dtype={'id': 'str'})\n",
  "try:\n",
  "    # Only the first chunk is used; the rest of the snapshot is never read.\n",
  "    first_chunk = next(iter(data))\n",
  "finally:\n",
  "    # Release the file handle held by the chunked reader.\n",
  "    data.close()\n",
  "# .copy() yields an independent frame, so adding 'full_text' in the next cell\n",
  "# does not write into a slice of the chunk (SettingWithCopyWarning).\n",
  "df = first_chunk[['id', 'title', 'abstract']].copy()\n" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def read_full_text(paper_id):\n",
  "    \"\"\"Return the contents of TXT_PATH/<paper_id>.pdf.txt, or None if that file does not exist.\"\"\"\n",
  "    file_path = TXT_PATH + '/' + str(paper_id) + '.pdf.txt'\n",
  "    if not os.path.isfile(file_path):\n",
  "        return None\n",
  "    with open(file_path, 'r', encoding='utf8') as f:\n",
  "        return f.read()\n",
  "\n",
  "\n",
  "# One pass over the ids instead of building a boolean mask over the whole frame per id.\n",
  "df['full_text'] = df['id'].map(read_full_text)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Drop papers with missing fields (including those without a TXT file) and renumber the rows.\n",
  "df = df.dropna().reset_index(drop=True)\n",
  "os.makedirs(CSV_PATH, exist_ok=True)\n",
  "df.to_csv(EN_CSV, index=False, encoding='utf-8')\n",
  "df.head()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# First run: create the working CSV\n" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Seed the working file only when it does not exist yet, so re-running the notebook\n",
  "# never overwrites translations saved by an earlier session.\n",
  "if not os.path.isfile(TRANSLATED_CSV):\n",
  "    df = pd.read_csv(EN_CSV, dtype={'id': 'str'})\n",
  "    df.to_csv(TRANSLATED_CSV, index=False, encoding='utf-8')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Load data" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "df = pd.read_csv(TRANSLATED_CSV, dtype={'id': 'str'})\n",
  "print(len(df.index))\n",
  "df.head()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Translate" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "model = EasyNMT('opus-mt')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def translate_column(frame, source_col, target_col, save_every):\n",
  "    \"\"\"Translate frame[source_col] to Spanish with `model`, writing into frame[target_col].\n",
  "\n",
  "    Every row of the frame is visited (no hard-coded row count). Rows whose\n",
  "    target cell is already filled are skipped, so a run interrupted after a\n",
  "    checkpoint can be resumed by reloading TRANSLATED_CSV and calling this again.\n",
  "    The frame is written to TRANSLATED_CSV every `save_every` rows and once\n",
  "    more when the loop finishes.\n",
  "    \"\"\"\n",
  "    if target_col in frame.columns:\n",
  "        # A column read back from CSV that is entirely empty is float; make it hold strings.\n",
  "        frame[target_col] = frame[target_col].astype(object)\n",
  "    else:\n",
  "        frame[target_col] = None\n",
  "    for n, idx in enumerate(frame.index):\n",
  "        if pd.notna(frame.at[idx, target_col]):\n",
  "            continue\n",
  "        # Read and write through the same label so both always refer to the same row.\n",
  "        frame.at[idx, target_col] = model.translate(frame.at[idx, source_col], target_lang='es')\n",
  "        print(\"listo documento \", n)\n",
  "        if n % save_every == 0:\n",
  "            frame.to_csv(TRANSLATED_CSV, index=False, encoding='utf-8')\n",
  "    frame.to_csv(TRANSLATED_CSV, index=False, encoding='utf-8')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Translate full text" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "translate_column(df, 'full_text', 'translated', save_every=10)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Translate abstract" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "translate_column(df, 'abstract', 'translated_abstract', save_every=100)\n" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Remove abstract" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# NOTE(review): `max` shadows the builtin and holds the last row position, not the\n",
  "# row count. The cells below no longer depend on it; it is kept only in case a cell\n",
  "# outside this section reads it -- confirm and delete.\n",
  "max = len(df.index)-1" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def strip_before_marker(text, marker):\n",
  "    \"\"\"Return `text` starting at the first occurrence of `marker`.\n",
  "\n",
  "    The text is returned unchanged when the marker is absent or when `text`\n",
  "    is not a string (for example a missing value).\n",
  "    \"\"\"\n",
  "    if not isinstance(text, str):\n",
  "        return text\n",
  "    pos = text.find(marker)\n",
  "    return text[pos:] if pos != -1 else text" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "end = 'Introducción'\n",
  "# Applied to every row, including the last one.\n",
  "df['translated_no_abstract'] = df['translated'].map(lambda text: strip_before_marker(text, end))\n",
  "df.to_csv(TRANSLATED_CSV, index=False, encoding='utf-8')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Cut at the introduction heading, mirroring the Spanish cell above; cutting at\n",
  "# 'Abstract' would keep the abstract in a column named text_no_abstract.\n",
  "end = 'Introduction'\n",
  "df['text_no_abstract'] = df['full_text'].map(lambda text: strip_before_marker(text, end))\n",
  "df.to_csv(TRANSLATED_CSV, index=False, encoding='utf-8')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Split data to CSV" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "df = pd.read_csv(TRANSLATED_CSV, dtype={'id': 'str'})\n",
  "df.head()" ] },
 { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "title | \n", "full_text | \n", "abstract | \n", "text_no_abstract | \n", "
---|---|---|---|---|---|
0 | \n", "0704.0002 | \n", "Sparsity-certifying Graph Decompositions | \n", "Descomposiciones del gráfico de certificación ... | \n", "Describimos un nuevo algoritmo, el juego de ... | \n", "Introducción y preliminares\\nEl foco de este d... | \n", "
1 | \n", "0704.0003 | \n", "The evolution of the Earth-Moon system based o... | \n", "La evolución del sistema Tierra-Luna basado en... | \n", "La evolución del sistema Tierra-Luna es desc... | \n", "Introducción \\nLa teoría aceptada popularmente... | \n", "
2 | \n", "0704.0004 | \n", "A determinant of Stirling cycle numbers counts... | \n", "Un determinante de los números de ciclo de Sti... | \n", "Demostramos que un determinante de los númer... | \n", "Introducción El propósito principal de este ar... | \n", "
3 | \n", "0704.0005 | \n", "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", "DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC... | \n", "En este artículo mostramos cómo calcular la ... | \n", "DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC... | \n", "
4 | \n", "0704.0007 | \n", "Polymer Quantum Mechanics and its Continuum Limit | \n", "La mecánica cuántica de polímeros y su límite ... | \n", "Una representación cuántica no estándar de l... | \n", "La mecánica cuántica de polímeros y su límite ... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1749 | \n", "0704.1996 | \n", "A Wave-function for Stringy Universes | \n", "LPTENS–07/16\\nAbril de 2007\\nUna función de on... | \n", "Definimos una función de onda para los fondo... | \n", "Introducción\\nNuestro objetivo en este documen... | \n", "
1750 | \n", "0704.1997 | \n", "Query on Negative Temperature, Internal Intera... | \n", "Microsoft Word - negEntr.doc\\nConsulta sobre t... | \n", "Después de que la temperatura negativa se vu... | \n", "Microsoft Word - negEntr.doc\\nConsulta sobre t... | \n", "
1751 | \n", "0704.1998 | \n", "Absence of the Fifth Force Problem in a Model ... | \n", "Ausencia del problema de la quinta fuerza en u... | \n", "Un modelo de escala invariante que contiene ... | \n", "Introducción\\n\\tBase de Dos Medidas Teoría de ... | \n", "
1752 | \n", "0704.1999 | \n", "Dark matter caustics and the enhancement of se... | \n", "Proyecto de versión 16 de noviembre de 2018\\nT... | \n", "Los haloes fríos de materia oscura están pob... | \n", "Proyecto de versión 16 de noviembre de 2018\\nT... | \n", "
1753 | \n", "0704.2000 | \n", "Search for a Higgs boson produced in associati... | \n", "FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... | \n", "Describimos una búsqueda para el modelo está... | \n", "FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... | \n", "
1754 rows × 5 columns
\n", "\n", " | id | \n", "title | \n", "full_text | \n", "abstract | \n", "text_no_abstract | \n", "
---|---|---|---|---|---|
0 | \n", "0704.0002 | \n", "Sparsity-certifying Graph Decompositions | \n", "Sparsity-certifying Graph Decompositions\\nIlea... | \n", "We describe a new algorithm, the $(k,\\ell)$-... | \n", "Introduction and preliminaries\\nThe focus of t... | \n", "
1 | \n", "0704.0003 | \n", "The evolution of the Earth-Moon system based o... | \n", "The evolution of the Earth-Moon system based o... | \n", "The evolution of Earth-Moon system is descri... | \n", "Introduction \\nThe popularly accepted theory f... | \n", "
2 | \n", "0704.0004 | \n", "A determinant of Stirling cycle numbers counts... | \n", "A Determinant of Stirling Cycle Numbers Counts... | \n", "We show that a determinant of Stirling cycle... | \n", "Introduction The chief purpose of this paper i... | \n", "
3 | \n", "0704.0005 | \n", "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", "FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL... | \n", "In this paper we show how to compute the $\\L... | \n", "FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL... | \n", "
4 | \n", "0704.0007 | \n", "Polymer Quantum Mechanics and its Continuum Limit | \n", "Polymer Quantum Mechanics and its Continuum Li... | \n", "A rather non-standard quantum representation... | \n", "Polymer Quantum Mechanics and its Continuum Li... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1749 | \n", "0704.1996 | \n", "A Wave-function for Stringy Universes | \n", "LPTENS–07/16\\nApril 2007\\nA Wave-function for ... | \n", "We define a wave-function for string theory ... | \n", "Introduction\\nOur goal in this paper is to emb... | \n", "
1750 | \n", "0704.1997 | \n", "Query on Negative Temperature, Internal Intera... | \n", "Microsoft Word - negEntr.doc\\nQuery on Negativ... | \n", "After negative temperature is restated, we f... | \n", "Microsoft Word - negEntr.doc\\nQuery on Negativ... | \n", "
1751 | \n", "0704.1998 | \n", "Absence of the Fifth Force Problem in a Model ... | \n", "Absence of the Fifth Force Problem in a Model ... | \n", "A scale invariant model containing dilaton $... | \n", "Introduction\\n\\tBasis of Two Measures Field Th... | \n", "
1752 | \n", "0704.1999 | \n", "Dark matter caustics and the enhancement of se... | \n", "Draft version November 16, 2018\\nPreprint type... | \n", "Cold dark matter haloes are populated by cau... | \n", "Draft version November 16, 2018\\nPreprint type... | \n", "
1753 | \n", "0704.2000 | \n", "Search for a Higgs boson produced in associati... | \n", "FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... | \n", "We describe a search for the standard model ... | \n", "FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... | \n", "
1754 rows × 5 columns
\n", "