# Spaces: Running  (appears to be page-header residue from where this file was copied; not part of the program)
import tarfile | |
import os | |
import requests | |
import datetime | |
import pandas as pd | |
import shutil | |
from bs4 import BeautifulSoup | |
from tqdm import tqdm | |
import base64 | |
def ToBase64(file):
    """Return the Base64 encoding of a file's raw bytes.

    :param file: path of the file to read (opened in binary mode)
    :return: ``bytes`` object holding the Base64-encoded content
    """
    with open(file, mode='rb') as handle:
        return base64.b64encode(handle.read())
def archive_dir(dir_name, output_filename, format="zip"):
    """Pack the contents of a directory into a single archive file.

    :param dir_name: directory whose contents become the root of the archive
    :param output_filename: archive path *without* the format extension
    :param format: archive format name understood by ``shutil.make_archive``
        (``"zip"`` by default)
    :return: path of the archive that was written, as reported by
        ``shutil.make_archive``
    """
    # make_archive chooses the extension from ``format``; returning its result
    # avoids reporting a ".zip" path when another format was requested.
    return shutil.make_archive(output_filename, format, dir_name)
def make_dir_if_not_exist(folder):
    """Create ``folder`` (including missing parents) unless the path already exists.

    :param folder: directory path to create
    """
    # Attempt the creation directly instead of testing first: a separate
    # existence check can go stale before ``makedirs`` runs.
    try:
        os.makedirs(folder)
    except FileExistsError:
        # The path is already there -- nothing to do.
        pass
def untar(fname, dirs):
    """
    Extract a tar archive (e.g. a ``.tar.gz`` file) into a directory.

    :param fname: path of the archive file
    :param dirs: directory the contents are extracted into
    :return: bool -- True on success, False if opening or extracting failed
    """
    extract_kwargs = {}
    if hasattr(tarfile, "data_filter"):
        # The archive content is not validated here; where the interpreter
        # supports extraction filters, refuse members that would land
        # outside ``dirs``.
        extract_kwargs["filter"] = "data"
    try:
        # The context manager closes the archive handle even if extraction
        # fails part-way through.
        with tarfile.open(fname) as archive:
            archive.extractall(path=dirs, **extract_kwargs)
        return True
    except Exception as e:
        # Best-effort by design: report the problem and signal failure
        # through the return value instead of raising.
        print(e)
        return False
def get_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    :return: 14-digit timestamp string, e.g. ``"20240131235959"``
    """
    # A datetime can format itself directly; there is no need to convert it
    # to text and re-parse it before calling strftime.
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')
def get_name_from_arvix(url):
    """Fetch a page and build a hyphenated title from its title heading.

    The title is read from the ``<h1 class="title mathjax">`` element of the
    page; a leading "Title:" label is dropped and spaces become hyphens.

    :param url: address of the page to fetch
    :return: the hyphenated title, or ``''`` when the page has no such heading
    :raises requests.RequestException: if the request fails or times out
    """
    # Bound the wait so an unresponsive server cannot stall the caller forever.
    page = requests.get(url, timeout=30)
    heading = BeautifulSoup(page.content, 'lxml').find(
        "h1", attrs={"class": "title mathjax"}
    )
    if heading is None:
        return ''
    text = heading.text
    label = "Title:"
    # Strip the label only when it is actually present, so a heading without
    # it does not lose the first characters of the real title.
    if text.startswith(label):
        text = text[len(label):]
    return text.replace(" ", "-")
def download_source(pdf_lists=None, output_base=None, project_name=None, fetch_title=True, return_source=False):
    """Download arXiv e-print sources for a list of links and archive the result.

    A project directory named ``project_name`` plus a timestamp is created
    under ``output_base`` with two subdirectories: ``input`` (the downloaded
    ``.tar.gz`` files) and ``output`` (one extracted folder per link). After
    all links are processed, ``output`` is archived via ``archive_dir`` using
    the base name ``<project directory>/<project_name + timestamp>``.

    :param pdf_lists: iterable of link strings; the last ``/``-separated
        segment of each link is used as the arXiv identifier. Must be
        supplied despite the ``None`` default.
    :param output_base: directory that will contain the project directory.
        Must be supplied despite the ``None`` default.
    :param project_name: prefix of the project directory name. Must be
        supplied despite the ``None`` default.
    :param fetch_title: when true, each extracted folder is named after the
        title returned by ``get_name_from_arvix`` and links with an empty
        title are skipped; otherwise the identifier is used as the name.
    :param return_source: when true, only print each source link; nothing is
        downloaded or extracted.
    :return: None
    """
    project_name = project_name + get_timestamp()
    base = os.path.join(output_base, project_name)
    make_dir_if_not_exist(base)

    # These paths do not depend on the link, and creating them up front keeps
    # ``out`` defined for the final archive step even when the list is empty
    # or every link is skipped.
    inp = os.path.join(base, 'input')
    make_dir_if_not_exist(inp)
    out = os.path.join(base, 'output')
    make_dir_if_not_exist(out)

    for pdf_link in tqdm(pdf_lists):
        file_stamp = pdf_link.split("/")[-1]
        if fetch_title:
            title = get_name_from_arvix(pdf_link)
            if not title:
                # No usable title was found for this link -- skip it.
                continue
        else:
            title = file_stamp
        source_link = "https://arxiv.org/e-print/" + file_stamp
        if return_source:
            print(source_link)
            continue
        # Bound the wait so one unresponsive download cannot stall the batch.
        response = requests.get(source_link, timeout=30)
        filepath = os.path.join(inp, file_stamp + ".tar.gz")
        # Close the file before extraction so the data is fully flushed.
        with open(filepath, "wb") as archive_file:
            archive_file.write(response.content)
        untar(filepath, os.path.join(out, title))
    archive_dir(out, os.path.join(base, project_name))
if __name__ == '__main__':
    # When run as a script, print the current timestamp string.
    print(get_timestamp())