File size: 3,629 Bytes
c61d3cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import logging
from typing import Any, Literal

import httpx
from mcp.server.fastmcp import FastMCP

# FastMCP server instance named "arxiv-omar"; the @mcp.tool() functions below
# register themselves on it.
mcp = FastMCP("arxiv-omar")

# Base URLs of the remote HTTP services the tools call.
# Used by get_publications (/search) and get_pdf_text (/extract_pdf/url).
CUSTOM_ARXIV_API_BASE = "https://om4r932-arxiv.hf.space"
# Used by web_search (/search).
DDG_API_BASE = "https://ychkhan-ptt-endpoints.hf.space"

# Helpers
async def make_request(url: str, data: dict = None) -> dict[str, Any] | None:
    if data is None:
        return None
    headers = {
        "Accept": "application/json"
    }
    async with httpx.AsyncClient(verify=False) as client:
        try:
            response = await client.post(url, headers=headers, json=data)
            print(response)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return None
        
def format_search(pub_id: str, content: dict) -> str:
    """Render one arXiv search hit as a human-readable text block.

    Missing fields are replaced by a placeholder instead of raising
    ``KeyError``, matching the other formatters in this module, so a single
    incomplete record cannot abort the whole search result.

    Args:
        pub_id: arXiv identifier of the publication.
        content: Publication metadata; the keys read are ``title``,
            ``authors``, ``date``, ``abstract`` and ``pdf``.

    Returns:
        A multi-line string with one labelled line per field.
    """
    return f"""
        arXiv publication ID : {pub_id}
        Title : {content.get("title", "No title found")}
        Authors : {content.get("authors", "No authors found")}
        Release Date : {content.get("date", "No date found")}
        Abstract : {content.get("abstract", "No abstract found")}
        PDF link : {content.get("pdf", "No PDF link found")}
    """

def format_extract(message: dict) -> str:
    """Render an extracted-PDF payload as a two-line text block.

    Reads the ``title`` and ``text`` keys of ``message``, substituting a
    placeholder for whichever one is absent.
    """
    pdf_title = message.get("title", "No title has been found")
    pdf_text = message.get("text", "No text !")
    pad = " " * 8
    return f"\n{pad}Title of PDF : {pdf_title}\n{pad}Text : {pdf_text}\n    "

def format_result_search(page: dict):
    """Render one web-search hit as a labelled, multi-line text block.

    Reads the ``title``, ``body`` and ``url`` keys of ``page``; absent title
    and body get a placeholder, an absent url is rendered as ``None``.
    """
    fields = (
        ("Title", page.get("title", "No titles found !")),
        ("Little description", page.get("body", "No description")),
        ("PDF url", page.get("url", None)),
    )
    rendered = "".join(f"        {label} : {value}\n" for label, value in fields)
    return "\n" + rendered + "    "

# Tools
@mcp.tool()
async def get_publications(keyword: str, limit: int = 15) -> str:
    """
    Get arXiv publications matching the given keywords, up to a maximum count

    Args:
        keyword: Keywords separated by spaces
        limit: Maximum number of publications returned (by default, 15)
    """
    url = f"{CUSTOM_ARXIV_API_BASE}/search"
    data = await make_request(url, data={"keyword": keyword, "limit": limit})
    # make_request returns None when the request fails, so this check has to
    # come before any key access on the response.
    if not data:
        return "Unable to fetch publications"
    if data.get("error"):
        # On error the service puts its explanation in "message".
        return data.get("message", "Unable to fetch publications")

    found = data.get("message")
    if not found:
        return "No publications found"

    # "message" maps publication id -> metadata dict.
    return "\n--\n".join(
        format_search(pub_id, content) for pub_id, content in found.items()
    )

@mcp.tool()
async def web_search(query: str) -> str:
    """
    Search the Web (through DuckDuckGo) for PDF files matching the keywords

    Args:
        query: Keywords used to search documents on the Web
    """
    endpoint = DDG_API_BASE + "/search"
    response = await make_request(endpoint, data={"query": query})
    if not response:
        return "Unable to fetch results"

    hits = response["results"]
    if len(hits) == 0:
        return "No results found"

    # One formatted block per hit, separated by a "--" line.
    return "\n--\n".join(map(format_result_search, hits))


@mcp.tool()
async def get_pdf_text(pdf_url: str, limit_page: int = -1) -> str:
    """
    Extract the text from the URL pointing to a PDF file

    Args:
        pdf_url: URL to a PDF document
        limit_page: Number of pages to extract content from (default: -1 for all pages)
    """
    url = f"{CUSTOM_ARXIV_API_BASE}/extract_pdf/url"
    payload = {"url": pdf_url}
    # -1 means "all pages": the page limit is only sent when one was given.
    if limit_page != -1:
        payload["page_num"] = limit_page

    data = await make_request(url, data=payload)
    # make_request returns None when the request fails, so this check has to
    # come before any key access on the response.
    if not data:
        return "Unable to extract PDF text"
    if data.get("error"):
        # On error the service puts its explanation in "message".
        return data.get("message", "Unable to extract PDF text")

    extracted = data.get("message")
    if not extracted:
        return "No text can be extracted from this PDF"

    return format_extract(extracted)

if __name__ == "__main__":
    # When executed as a script, start the server using the stdio transport.
    mcp.run(transport="stdio")