File size: 3,136 Bytes
6eff5e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar

### Input Formatting Module

## Input formatting for the given paper
# Extracting text from a pdf or a link

def get_text_from_pdf(file_path):
    """
    Extract the text of a pdf, one string per page.

    Returns a list of strings, in page order, as produced by
    ``pypdf.PageObject.extract_text``.
    """
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]

def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download the paper behind ``url`` and return its text as a list of
    per-page strings.

    Parameters
    ----------
    url : str
        Link to the paper. Only arXiv links are supported: abstract
        pages (``.../abs/<id>``) are rewritten to their pdf link, and
        pdf links (``.../pdf/...``) are used as-is.
    file_path : str
        Local path the downloaded pdf is saved to.

    Raises
    ------
    ValueError
        If the url is not a recognized arXiv link, or the download fails.
    """
    # TODO check for other valid urls (e.g. semantic scholar)

    ## Check for different URL cases
    url_parts = urlparse(url)
    # arxiv: match the actual hostname (arxiv.org or a subdomain such as
    # www.arxiv.org / export.arxiv.org). The previous substring test
    # ('arxiv' in netloc) also accepted look-alike domains like
    # 'myarxiv.evil.com'.
    if url_parts.netloc == 'arxiv.org' or url_parts.netloc.endswith('.arxiv.org'):
        if 'abs' in url_parts.path:
            # abstract page, change the url to the corresponding pdf link
            paper_id = url_parts.path.split('/')[-1]
            url = f'https://www.arxiv.org/pdf/{paper_id}.pdf'
        elif 'pdf' in url_parts.path:
            # already a pdf link, pass
            pass
        else:
            raise ValueError('invalid url')
    else:
        raise ValueError('invalid url')

    # download the file
    download_pdf(url, file_path)

    # get the text from the pdf file
    text = get_text_from_pdf(file_path)
    return text
    
def download_pdf(url, file_name, timeout=30):
    """
    Download the pdf file from the given url and save it as ``file_name``.

    Parameters
    ----------
    url : str
        Direct link to the pdf.
    file_name : str
        Path the downloaded pdf is written to.
    timeout : float
        Seconds to wait for the server (new parameter, defaults to 30 so
        existing callers are unaffected; without it a dead host could
        hang the request forever).

    Raises
    ------
    ValueError
        If the server does not answer with HTTP 200 — no file is written
        in that case, so failing loudly is the only safe option.
    """
    # Send GET request
    response = requests.get(url, timeout=timeout)

    # Save the PDF
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        # Previously other status codes were only printed and the
        # function returned silently, leaving the caller to read a
        # file that was never written. Raise instead.
        raise ValueError('cannot download the file (HTTP %d)' % response.status_code)
        
## Input formatting for the given author (reviewer)
# Extracting text from a link

def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and papers (title + abstract) from the
    Semantic Scholar Graph API.

    Parameters
    ----------
    author_id : int or str
        Semantic Scholar author id (e.g. 1737249).
    max_count : int
        Maximum number of papers to return.

    Returns
    -------
    tuple
        ``(name, papers)`` — the author's name and a list of paper
        records as returned by the API.

    Raises
    ------
    ValueError
        If ``author_id`` is None or unknown, or if the API request
        fails for any other reason.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    author_id = str(author_id)
    url = ('https://api.semanticscholar.org/graph/v1/author/'
           f'{author_id}?fields=url,name,paperCount,papers,papers.title,papers.abstract')
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Input valid author ID')
    if r.status_code != 200:
        # e.g. 429 rate limiting — previously this fell through and
        # crashed with KeyError on the missing 'papers' field.
        raise ValueError('Semantic Scholar request failed (HTTP %d)' % r.status_code)
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']

    return name, papers

## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for actual task

def get_title(text):
    """Return the paper title found in ``text``. TODO: not implemented yet."""
    return None

def get_abstract(text):
    """Return the abstract found in ``text``. TODO: not implemented yet."""
    return None

def get_introduction(text):
    """Return the introduction section found in ``text``. TODO: not implemented yet."""
    return None

def get_conclusion(text):
    """Return the conclusion section found in ``text``. TODO: not implemented yet."""
    return None


if __name__ == '__main__':
    def run_sample():
        """Smoke test: the abs page and the direct pdf link of the same
        arXiv paper should both yield text whose first line is the title."""
        expected = 'Sanity Simulations for Saliency Methods'

        pages = get_text_from_url('https://arxiv.org/abs/2105.06506')
        assert(pages[0].split('\n')[0] == expected)

        pages = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
        assert(pages[0].split('\n')[0] == expected)

    # run the smoke test when executed as a script
    run_sample()