File size: 3,136 Bytes
6eff5e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar

### Input Formatting Module

## Input formatting for the given paper
# Extracting text from a pdf or a link

def get_text_from_pdf(file_path):
    """
    Extract the text of a pdf, one string per page.

    Returns a list of strings, in page order, as produced by
    ``pypdf.PageObject.extract_text``.
    """
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]

def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download the paper behind ``url`` and return its text as a list of
    per-page strings.

    Parameters
    ----------
    url : str
        Link to the paper. Only arXiv links are supported: abstract
        pages (``.../abs/<id>``) are rewritten to their pdf link, and
        pdf links (``.../pdf/...``) are used as-is.
    file_path : str
        Local path the downloaded pdf is saved to.

    Raises
    ------
    ValueError
        If the url is not a recognized arXiv link, or the download fails.
    """
    # TODO check for other valid urls (e.g. semantic scholar)

    ## Check for different URL cases
    url_parts = urlparse(url)
    # arxiv: match the actual hostname (arxiv.org or a subdomain such as
    # www.arxiv.org / export.arxiv.org). The previous substring test
    # ('arxiv' in netloc) also accepted look-alike domains like
    # 'myarxiv.evil.com'.
    if url_parts.netloc == 'arxiv.org' or url_parts.netloc.endswith('.arxiv.org'):
        if 'abs' in url_parts.path:
            # abstract page, change the url to the corresponding pdf link
            paper_id = url_parts.path.split('/')[-1]
            url = f'https://www.arxiv.org/pdf/{paper_id}.pdf'
        elif 'pdf' in url_parts.path:
            # already a pdf link, pass
            pass
        else:
            raise ValueError('invalid url')
    else:
        raise ValueError('invalid url')

    # download the file
    download_pdf(url, file_path)

    # get the text from the pdf file
    text = get_text_from_pdf(file_path)
    return text
    
def download_pdf(url, file_name, timeout=30):
    """
    Download the pdf file from the given url and save it as ``file_name``.

    Parameters
    ----------
    url : str
        Direct link to the pdf.
    file_name : str
        Path the downloaded pdf is written to.
    timeout : float
        Seconds to wait for the server (new parameter, defaults to 30 so
        existing callers are unaffected; without it a dead host could
        hang the request forever).

    Raises
    ------
    ValueError
        If the server does not answer with HTTP 200 — no file is written
        in that case, so failing loudly is the only safe option.
    """
    # Send GET request
    response = requests.get(url, timeout=timeout)

    # Save the PDF
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        # Previously other status codes were only printed and the
        # function returned silently, leaving the caller to read a
        # file that was never written. Raise instead.
        raise ValueError('cannot download the file (HTTP %d)' % response.status_code)
        
## Input formatting for the given author (reviewer)
# Extracting text from a link

def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and papers (title + abstract) from the
    Semantic Scholar Graph API.

    Parameters
    ----------
    author_id : int or str
        Semantic Scholar author id (e.g. 1737249).
    max_count : int
        Maximum number of papers to return.

    Returns
    -------
    tuple
        ``(name, papers)`` — the author's name and a list of paper
        records as returned by the API.

    Raises
    ------
    ValueError
        If ``author_id`` is None or unknown, or if the API request
        fails for any other reason.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    author_id = str(author_id)
    url = ('https://api.semanticscholar.org/graph/v1/author/'
           f'{author_id}?fields=url,name,paperCount,papers,papers.title,papers.abstract')
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Input valid author ID')
    if r.status_code != 200:
        # e.g. 429 rate limiting — previously this fell through and
        # crashed with KeyError on the missing 'papers' field.
        raise ValueError('Semantic Scholar request failed (HTTP %d)' % r.status_code)
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']

    return name, papers

## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for actual task

def get_title(text):
    """Return the paper title found in ``text``. TODO: not implemented yet."""
    return None

def get_abstract(text):
    """Return the abstract found in ``text``. TODO: not implemented yet."""
    return None

def get_introduction(text):
    """Return the introduction section found in ``text``. TODO: not implemented yet."""
    return None

def get_conclusion(text):
    """Return the conclusion section found in ``text``. TODO: not implemented yet."""
    return None


if __name__ == '__main__':
    def run_sample():
        """Smoke test: the abs page and the direct pdf link of the same
        arXiv paper should both yield text whose first line is the title."""
        expected = 'Sanity Simulations for Saliency Methods'

        pages = get_text_from_url('https://arxiv.org/abs/2105.06506')
        assert(pages[0].split('\n')[0] == expected)

        pages = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
        assert(pages[0].split('\n')[0] == expected)

    # run the smoke test when executed as a script
    run_sample()