File size: 3,839 Bytes
c3358e9
 
c8c0cec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3358e9
 
 
 
 
b715940
e3ff1d7
c8c0cec
 
 
 
 
c3358e9
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
from urllib import request

import gradio as gr
from huggingface_hub import InferenceClient
from lxml import etree

# Host prefix for each venue code; `search` prepends it to the relative paper
# links scraped from that venue's listing page.
url_prefix_mapping = {
    'acl': 'https://aclanthology.org',
    'emnlp': 'https://aclanthology.org',
    'naacl': 'https://aclanthology.org',
    'tacl': 'https://aclanthology.org',
    'nips': 'https://papers.nips.cc',
    # ICML proceedings are served from PMLR (see get_paper_home), not from the
    # NeurIPS site this entry used to point at.
    'icml': 'https://proceedings.mlr.press',
    'iclr': 'https://iclr.cc',
}

# PMLR volume identifier for each ICML edition, keyed by (venue, year).
# get_paper_home appends the volume to https://proceedings.mlr.press/.
mlr_mapping = {
    ('icml', 2020): 'v119',
    ('icml', 2021): 'v139',
    ('icml', 2022): 'v162',
    ('icml', 2023): 'v202',
    ('icml', 2024): 'v235',  # previously a copy of the 2021 volume (v139)
}

def get_paper_home(venue, year):
    """Return the URL of the page listing all papers of *venue* in *year*.

    Args:
        venue: Lower-case venue code ('acl', 'emnlp', 'naacl', 'nips',
            'icml' or 'iclr').
        year: Publication year; for 'icml' it is looked up in ``mlr_mapping``.

    Returns:
        The listing URL as a string, or ``None`` when no URL can be built:
        either the venue has no branch here (e.g. 'tacl') or the ICML year
        has no volume in ``mlr_mapping``.
    """
    if venue in ('acl', 'emnlp', 'naacl'):
        return f'https://aclanthology.org/events/{venue}-{year}'

    if venue == 'nips':
        return f'https://papers.{venue}.cc/paper_files/paper/{year}'

    if venue == 'icml':
        # An unmapped year used to raise KeyError out of the caller's loop;
        # report it as "no listing page" instead.
        volume = mlr_mapping.get((venue, year))
        if volume is None:
            return None
        return f'https://proceedings.mlr.press/{volume}'

    if venue == 'iclr':
        return f'https://iclr.cc/Downloads/{year}'

    # Unsupported venue: make the previously implicit None explicit.
    return None


# Venue codes searched for each UI checkbox label, in search order.
_VENUE_GROUPS = {
    "NeurIPS/ICLR/ICML": ('nips', 'iclr', 'icml'),
    "*ACL": ('acl', 'emnlp', 'naacl', 'tacl'),
    # No listing URL or page selector exists for the computer-vision venues,
    # so this label selects nothing instead of repeating the ML venues.
    "CVPR/ECCV/ICCV": (),
}

# XPath selecting one node per paper on each venue's listing page.
_PAPER_XPATH = {
    'acl': ".//a[@class='align-middle']",
    'emnlp': ".//a[@class='align-middle']",
    'naacl': ".//a[@class='align-middle']",
    'tacl': ".//a[@class='align-middle']",
    'iclr': ".//a[@class='Poster']",
    'nips': ".//a[@title='paper title']",
    'icml': ".//div[@class='paper']",
}

# Years searched when the caller supplies none and the module defines no
# `years` of its own. NOTE(review): chosen to match the years covered by
# mlr_mapping -- confirm the intended range.
_DEFAULT_YEARS = (2020, 2021, 2022, 2023, 2024)

# Seconds to wait for one listing page before skipping it.
_FETCH_TIMEOUT = 30


def _compile_patterns(keywords):
    """Turn keywords into a list of compiled, case-insensitive regexes.

    A single string is split into lines, one pattern per non-blank line, so
    it is no longer iterated character by character. Items that are not
    strings are assumed to be already-compiled patterns and pass through.
    """
    if keywords is None:
        return []
    if isinstance(keywords, str):
        keywords = keywords.splitlines()

    compiled = []
    for item in keywords:
        if isinstance(item, str):
            if item.strip():
                compiled.append(re.compile(item, re.IGNORECASE))
        else:
            compiled.append(item)
    return compiled


def _matches_any(text, patterns):
    """Return True when at least one pattern is found in *text*."""
    return any(p.search(text) for p in _compile_patterns(patterns))


def _paper_link(element, venue, url_prefix):
    """Return the paper URL held by a matched listing node, or None."""
    if venue == 'icml':
        # ICML nodes are <div class="paper">; the link is the first anchor
        # inside the "links" paragraph and is used as-is.
        links = element.find('.//p[@class="links"]')
        anchor = links.find('a') if links is not None else None
        return anchor.get('href') if anchor is not None else None

    # Every other venue selects the <a> itself, so the href lives on the
    # element; looking for a nested <p class="links"> always yielded None.
    href = element.get('href')
    return url_prefix + href if href else None


def check_key_words(ele, patterns=None):
    """Tell whether the text inside *ele* matches any keyword pattern.

    Args:
        ele: Element whose full inner text (``itertext``) is searched.
        patterns: A regex string (one pattern per line), or an iterable of
            regex strings / compiled patterns. When omitted, the
            module-level name ``keywords`` is used, as before.

    Returns:
        True if any pattern matches, ignoring case; otherwise False.

    Raises:
        re.error: If a pattern string is not a valid regular expression.
    """
    if patterns is None:
        patterns = keywords  # legacy fallback to the module-level name
    return _matches_any(''.join(ele.itertext()), patterns)


def check_key_words_icml(ele, patterns=None):
    """Tell whether the title of an ICML paper node matches any pattern.

    Only the text of the ``<p class="title">`` descendant is searched.

    Args:
        ele: The paper node (``<div class="paper">``).
        patterns: Same forms as for ``check_key_words``; when omitted, the
            module-level name ``keywords`` is used, as before.

    Returns:
        True if any pattern matches the title, ignoring case; False
        otherwise, including when the node has no title paragraph.

    Raises:
        re.error: If a pattern string is not a valid regular expression.
    """
    if patterns is None:
        patterns = keywords  # legacy fallback to the module-level name
    title = ele.find('.//p[@class="title"]')
    if title is None:
        return False
    return _matches_any(''.join(title.itertext()), patterns)


def search(keywords, venues, years=None):
    """Collect links to papers whose titles match *keywords*.

    For every selected venue and year the listing page is downloaded and
    parsed, and the link of each paper whose title matches is kept. Pages
    that cannot be fetched are skipped. Progress is printed to stdout.

    Args:
        keywords: Regex string (one pattern per line) or iterable of regex
            strings; a title matches if any pattern is found, ignoring case.
        venues: Iterable of UI labels ("NeurIPS/ICLR/ICML", "*ACL",
            "CVPR/ECCV/ICCV") choosing which venue groups to search.
        years: Iterable of years to search. When omitted, a module-level
            ``years`` is used if defined, otherwise ``_DEFAULT_YEARS``.

    Returns:
        A list of paper URLs; empty when there are no keywords, no
        searchable venues, or no matches.

    Raises:
        re.error: If a keyword is not a valid regular expression.
    """
    patterns = _compile_patterns(keywords)
    if years is None:
        years = globals().get('years', _DEFAULT_YEARS)

    selected = set(venues or ())
    search_venues = []
    for label, codes in _VENUE_GROUPS.items():
        if label not in selected:
            continue
        for code in codes:
            if code not in search_venues:  # never fetch a venue twice
                search_venues.append(code)

    # Nothing can match without patterns, so skip the network entirely.
    if not patterns or not search_venues:
        return []

    results = []
    for venue in search_venues:
        paper_tag_on_html = _PAPER_XPATH[venue]
        url_prefix = url_prefix_mapping[venue]
        is_match = check_key_words_icml if venue == 'icml' else check_key_words

        for year in years:
            print(venue, year)

            try:
                paper_home = get_paper_home(venue, year)
            except KeyError:
                # No proceedings volume is known for this venue/year.
                paper_home = None
            if not paper_home:
                continue

            try:
                with request.urlopen(paper_home, timeout=_FETCH_TIMEOUT) as response:
                    html = response.read().decode()
            except (OSError, ValueError) as err:
                # URLError/HTTPError and timeouts are OSError; bad URLs and
                # undecodable bodies are ValueError. Best effort: move on.
                print(f'skipped {paper_home}: {err}')
                continue

            tree = etree.fromstring(html, etree.HTMLParser())
            if tree is None:
                continue

            elements = [e for e in tree.findall(paper_tag_on_html) if is_match(e, patterns)]
            for element in elements:
                link = _paper_link(element, venue, url_prefix)
                if link:
                    results.append(link)

            print(len(elements))
            print()

    return results


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Venue groups offered as checkboxes; every group is ticked by default.
_venue_labels = ["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"]

# Free-text box for the title keywords.
_keyword_input = gr.Textbox(
    lines=2,
    placeholder="Keywords of the paper title. Supports ReGex.",
)
_venue_input = gr.CheckboxGroup(
    list(_venue_labels),
    label="Choose Venues to Search",
    value=list(_venue_labels),
)
# Single-column result table; "Title" and "Authors" columns are not emitted.
# NOTE(review): `search` returns a flat list of URL strings -- confirm that
# gr.DataFrame renders it as one row per link.
_result_output = gr.DataFrame(headers=["Paper Link"])

# Web UI: the keyword box and venue checkboxes feed `search`, whose return
# value fills the result table.
demo = gr.Interface(
    fn=search,
    inputs=[_keyword_input, _venue_input],
    outputs=_result_output,
)


# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()