arXiv2Latex / backend.py
yiyixin's picture
upload
6be34e2
raw
history blame
2.46 kB
import tarfile
import os
import requests
import datetime
import pandas as pd
import shutil
from bs4 import BeautifulSoup
from tqdm import tqdm
import base64
def ToBase64(file):
    """Read a file in binary mode and return its contents Base64-encoded.

    :param file: path of the file to encode
    :return: the Base64 encoding of the file's bytes, as ``bytes``
    """
    with open(file, 'rb') as handle:
        return base64.b64encode(handle.read())
def archive_dir(dir_name, output_filename, format="zip"):
    """Pack the contents of a directory into a single archive file.

    :param dir_name: directory whose contents are archived
    :param output_filename: path of the archive to create, WITHOUT extension
    :param format: archive format name passed to ``shutil.make_archive``
    :return: ``output_filename`` followed by the extension of the archive
        that was actually written (".zip" for the default format)
    """
    archive_path = shutil.make_archive(output_filename, format, dir_name)
    # Take only the extension from make_archive's result: the reported path
    # may be absolute, while callers expect the form they passed in.
    stem = os.path.basename(output_filename)
    extension = os.path.basename(archive_path)[len(stem):]
    return output_filename + extension
def make_dir_if_not_exist(folder):
    """Create ``folder`` (and any missing parents) unless the path exists.

    :param folder: path of the directory to create
    :return: None
    """
    # Attempt the creation instead of checking first: a separate
    # exists()/makedirs() pair can fail when something else creates the
    # path between the two calls.
    try:
        os.makedirs(folder)
    except FileExistsError:
        # The path is already present; nothing to do.
        pass
def untar(fname, dirs):
    """
    Extract a tar archive (e.g. a tar.gz file).

    :param fname: path of the archive file
    :param dirs: directory the contents are extracted into
    :return: bool, True on success; False if opening or extracting failed
        (the error is printed)
    """
    try:
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(fname) as archive:
            if hasattr(tarfile, "data_filter"):
                # Archives may come from the network: where the interpreter
                # supports extraction filters, refuse members that would
                # land outside ``dirs``.
                archive.extractall(path=dirs, filter="data")
            else:
                archive.extractall(path=dirs)
        return True
    except Exception as e:
        # Best-effort contract: report the problem and signal failure.
        print(e)
        return False
def get_timestamp():
    """Return the current local time formatted as ``YYYYMMDDHHMMSS``.

    :return: 14-digit string, e.g. "20240131235959"
    """
    # Format the datetime directly; no need to stringify it and re-parse it.
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')
def get_name_from_arvix(url, timeout=30):
    """Download a page and build a hyphenated title from its title heading.

    The page is presumably an arXiv abstract page (verify against callers);
    the title is read from the first ``<h1 class="title mathjax">`` element.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before ``requests`` raises
    :return: the heading text without its leading "Title:" label and with
        spaces replaced by "-"; '' when the page has no such heading
    """
    # Without a timeout an unresponsive server would block the caller forever.
    page = requests.get(url, timeout=timeout)
    heading = BeautifulSoup(page.content, 'lxml').find("h1", attrs={"class": "title mathjax"})
    if heading is None:
        return ''
    text = heading.text
    label = "Title:"
    # Drop the label only when it is really there, so a heading without it
    # does not lose the first characters of the actual title.
    if text.startswith(label):
        text = text[len(label):]
    return text.replace(" ", "-")
def download_source(pdf_lists=None, output_base=None, project_name=None, fetch_title=True, return_source=False, timeout=60):
    """Download the e-print sources for a list of arXiv links and zip them.

    A working folder ``<output_base>/<project_name><timestamp>`` is created
    with two sub-folders: ``input`` receives each downloaded archive and
    ``output`` receives one extracted folder per paper. Finally ``output``
    is packed into ``<project_name><timestamp>.zip`` inside the working
    folder.

    :param pdf_lists: iterable of link strings; the last path segment of each
        link is used as the paper id. ``None`` is treated as an empty list.
    :param output_base: directory under which the working folder is created
        (required despite the ``None`` default)
    :param project_name: prefix of the working-folder name (required despite
        the ``None`` default)
    :param fetch_title: if True, name each extracted folder after the title
        fetched from the link and skip links whose title cannot be found;
        if False, name it after the paper id
    :param return_source: if True, only print each source URL and download
        nothing
    :param timeout: seconds to wait for each source download
    :return: None
    """
    project_name = project_name + get_timestamp()
    base = os.path.join(output_base, project_name)
    make_dir_if_not_exist(base)

    # Create both folders up front: the archive step below needs ``out`` even
    # when the list is empty or every link is skipped.
    inp = os.path.join(base, 'input')
    make_dir_if_not_exist(inp)
    out = os.path.join(base, 'output')
    make_dir_if_not_exist(out)

    links = [] if pdf_lists is None else pdf_lists
    for pdf_link in tqdm(links):
        # Tolerate a trailing slash, which would otherwise give an empty id.
        file_stamp = pdf_link.rstrip("/").split("/")[-1]
        if fetch_title:
            title = get_name_from_arvix(pdf_link)
            if not title:
                continue
        else:
            title = file_stamp
        source_link = "https://arxiv.org/e-print/" + file_stamp
        if return_source:
            print(source_link)
            continue
        response = requests.get(source_link, timeout=timeout)
        if not response.ok:
            # Do not store an HTTP error page as if it were an archive.
            print("download failed (HTTP %s): %s" % (response.status_code, source_link))
            continue
        filepath = os.path.join(inp, file_stamp + ".tar.gz")
        with open(filepath, "wb") as archive_file:
            archive_file.write(response.content)
        # A path separator inside a title must not create nested folders.
        folder_name = title.replace("/", "-").replace(os.sep, "-")
        untar(filepath, os.path.join(out, folder_name))
    archive_dir(out, os.path.join(base, project_name))
if __name__ == '__main__':
    # When run as a script, print the current timestamp string.
    print(get_timestamp())