import gradio as gr
import requests
from transformers import AutoTokenizer, pipeline

NEL_MODEL_NAME = "impresso-project/nel-mgenre-multilingual"

# Load the tokenizer for the pre-trained model:
# https://huggingface.co/impresso-project/nel-mgenre-multilingual
nel_tokenizer = AutoTokenizer.from_pretrained(NEL_MODEL_NAME)

# Build the entity-linking pipeline; the custom "generic-nel" task is
# defined in the model repository, hence trust_remote_code=True.
nel_pipeline = pipeline(
    "generic-nel",
    model=NEL_MODEL_NAME,
    tokenizer=nel_tokenizer,
    trust_remote_code=True,
    device="cpu",
)
print("Model loaded successfully!")


def get_wikipedia_page_props(input_str: str):
    """
    Retrieves the Wikidata QID for a Wikipedia page on the given language edition.

    Args:
        input_str (str): The input string in the format "page_name >> language".

    Returns:
        str: The QID, or "NIL" if the lookup fails or no QID is found.
    """
    try:
        # Split the model output into page name and language code
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        return "Invalid input format. Use 'page_name >> language'."

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    qid = "NIL"
    try:
        # Ask the Wikipedia API for the page's Wikidata item
        response = requests.get(wikipedia_url, params=wikipedia_params)
        response.raise_for_status()
        data = response.json()

        pages = data["query"].get("pages", {})
        if pages:
            page_id = next(iter(pages))
            page_props = pages[page_id].get("pageprops", {})
            if "wikibase_item" in page_props:
                return page_props["wikibase_item"]
        return qid
    except Exception:
        return qid


def get_wikipedia_title(qid, language="en"):
    """Resolves a Wikidata QID to its page title and URL on the given language Wikipedia."""
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }
    response = requests.get(url, params=params)
    data = response.json()
    try:
        title = data["entities"][qid]["sitelinks"][f"{language}wiki"]["title"]
        page_url = data["entities"][qid]["sitelinks"][f"{language}wiki"]["url"]
        return title, page_url
    except KeyError:
        return "NIL", "None"


def disambiguate_sentence(sentence):
    # Run the entity-linking pipeline and keep the first linked entity
    linked_entity = nel_pipeline(sentence)
    linked_entity = linked_entity[0]

    # Create an HTML output with a clickable link
    entity_info = f"""