Upload 5 files
- .streamlit/config.toml +3 -0
- app.py +31 -0
- requirements.txt +6 -0
- utils/__init__.py +0 -0
- utils/basic_crawl_gg_scholar.py +162 -0

.streamlit/config.toml
ADDED

[theme]
base="light"
font="serif"

app.py
ADDED

import streamlit as st
from utils.basic_crawl_gg_scholar import scrape_gg_scholar
# NOTE: the two modules below are referenced but not included in this upload;
# their imports are commented out (like their usage further down) so the app
# can start without an ImportError.
# from utils.retrieve_doi_by_name import get_doi_by_title
# from utils.get_abstract_by_doi import get_abstract_by_doi


st.set_page_config(page_title="GG Scholar Crawler :v", page_icon=":book:", layout="centered")

st.title("Google Scholar Crawler :book:")

col_1, col_2, col_3, col_4 = st.columns(spec=[5, 1, 1, 1])

keyword = col_1.text_input("Keyword to search:", key="keyword", placeholder="Enter keyword to search...", label_visibility="hidden")
num_pages = col_2.number_input("Pages:", key="pages", placeholder="Number of pages:...", min_value=1, max_value=9999999, value=1, step=1)
start_year = col_3.number_input("Start:", min_value=1900, max_value=2025, value=2020, key="start_year")
end_year = col_4.number_input("End:", min_value=1900, max_value=2025, value=2025, key="end_year")
is_start = st.button("Crawl!", key="crawl_button")

if is_start:
    with st.spinner("Crawling basic info..."):
        basic_crawled_data = scrape_gg_scholar(query=keyword, num_pages=num_pages, start_year=start_year, end_year=end_year)
    st.dataframe(basic_crawled_data, use_container_width=True)
    st.success("Crawled basic info successfully!")

    # with st.spinner("Retrieving DOI..."):
    #     doi_crawled_data = get_doi_by_title(basic_crawled_data)
    # st.dataframe(doi_crawled_data, use_container_width=True)
    # st.success("Retrieved DOI successfully!")
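
app.py references utils/retrieve_doi_by_name and utils/get_abstract_by_doi, but neither module is part of this five-file upload, which is why the related imports and spinner block above are commented out. As a placeholder until those files land, here is a minimal sketch of what get_doi_by_title could look like against the public Crossref REST API; the function name matches the import in app.py, but everything else (parameters, error handling, the DOI column name) is an assumption, not the author's implementation:

# Hypothetical utils/retrieve_doi_by_name.py -- a sketch, not part of this commit.
# Assumes the Crossref REST API (https://api.crossref.org/works) and the list of
# GGS_* dicts returned by scrape_gg_scholar.
import requests


def get_doi_by_title(records, timeout=10):
    """Best-effort DOI lookup for each record's GGS_title via Crossref."""
    for record in records:
        try:
            response = requests.get(
                "https://api.crossref.org/works",
                params={"query.bibliographic": record["GGS_title"], "rows": 1},
                timeout=timeout,
            )
            items = response.json().get("message", {}).get("items", [])
            record["DOI"] = items[0].get("DOI", "") if items else ""
        except (requests.RequestException, ValueError, KeyError):
            record["DOI"] = ""  # leave the column blank on lookup failure
    return records

A matching get_abstract_by_doi could query https://api.crossref.org/works/{doi} and read the optional "abstract" field, with the caveat that Crossref only carries abstracts for a subset of publishers.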

requirements.txt
ADDED

requests
feedparser
beautifulsoup4
urllib3
pandas
streamlit
utils/__init__.py
ADDED
File without changes

utils/basic_crawl_gg_scholar.py
ADDED

"""Scrape Google Scholar for basic article info (title, link, abstract, publication info)."""
import re
import csv
import time
import random
import requests
from bs4 import BeautifulSoup


def extract_data_from_publication_info(publication_info):
    # Google Scholar's .gs_a line typically reads "authors - journal, year - publisher".
    regex_pattern = r"(.+?)\s+-\s+(.+?),\s+(\d{4})\s+-\s+(.+)$"

    match = re.match(regex_pattern, publication_info)

    if match:
        authors = match.group(1).strip()
        journal = match.group(2).strip()
        year = match.group(3).strip()
        publisher = match.group(4).strip()
    else:
        authors = journal = year = publisher = "Unknown"

    return {
        "authors": authors,
        "journal": journal,
        "year": year,
        "publisher": publisher
    }

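# Illustrative parse (the sample .gs_a string below is invented for demonstration):
#   extract_data_from_publication_info(
#       "J Doe, A Nguyen - Nature Communications, 2023 - nature.com")
#   -> {"authors": "J Doe, A Nguyen", "journal": "Nature Communications",
#       "year": "2023", "publisher": "nature.com"}
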
def scrape_gg_scholar(query, num_pages=1, start_year=None, end_year=None):
    results = []

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    base_url = "https://scholar.google.com/scholar"

    params = {
        "q": query,  # requests URL-encodes parameters, so spaces and special characters are safe
        "hl": "en",
        "as_sdt": "0,5"
    }

    if start_year and end_year:
        params["as_ylo"] = start_year
        params["as_yhi"] = end_year

    for page in range(num_pages):
        params["start"] = page * 10  # Scholar paginates 10 results per page

        try:
            response = requests.get(base_url, params=params, headers=headers, timeout=30)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                articles = soup.select(".gs_r.gs_or.gs_scl")

                for article in articles:
                    title_element = article.select_one(".gs_rt a")

                    if title_element:
                        title = title_element.text
                        link = title_element["href"]

                        abstract_element = article.select_one(".gs_rs")
                        abstract = abstract_element.text if abstract_element else ""
                        abstract = abstract.replace("…", "").replace("\n", " ")
                        abstract = " ".join(abstract.split())

                        pub_info_element = article.select_one(".gs_a")
                        pub_info = pub_info_element.text if pub_info_element else ""

                        pub_info_parsed = extract_data_from_publication_info(pub_info)

                        results.append({
                            "GGS_title": title,
                            "GGS_link": link,
                            "GGS_brief_abstract": abstract,
                            "GGS_publication_info": pub_info,
                            "GGS_authors": pub_info_parsed["authors"],
                            "GGS_journal": pub_info_parsed["journal"],
                            "GGS_year": pub_info_parsed["year"],
                            "GGS_publisher": pub_info_parsed["publisher"]
                        })

                time.sleep(random.uniform(1, 3))  # Random sleep to avoid being blocked

            else:
                print(f"ERROR: STATUS CODE {response.status_code}")
                break

        except Exception as e:
            print(f"An error occurred: {e}")
            break

    return results

def save_to_csv(data, filename="base_crawling_from_gg_scholar.csv"):
    if not data:
        print("No data to save.")
        return

    with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"Data saved to {filename}")


def main():
    query = input("Enter your search query: ")
    num_pages = int(input("Enter the number of pages to scrape: "))

    use_time_filter = input("Do you want to filter by year? (y/n): ").strip().lower()
    start_year = None
    end_year = None

    if use_time_filter == 'y':
        start_year = input("Enter the start year (format: YYYY; for example: 2020): ")
        end_year = input("Enter the end year (format: YYYY; for example: 2025): ")

    results = scrape_gg_scholar(query, num_pages, start_year, end_year)

    print(f"Found {len(results)} results.")

    for i, result in enumerate(results):
        # The result dicts use the GGS_ prefix assigned in scrape_gg_scholar.
        print(f"{i + 1}. {result['GGS_title']}")
        print(f"   Link: {result['GGS_link']}")
        print(f"   Brief Abstract: {result['GGS_brief_abstract']}")
        print(f"   Publication Info: {result['GGS_publication_info']}")
        print(f"   Authors: {result['GGS_authors']}")
        print(f"   Journal: {result['GGS_journal']}")
        print(f"   Year: {result['GGS_year']}")
        print(f"   Publisher: {result['GGS_publisher']}")
        print("=" * 100)

    save_option = input("Do you want to save the results to a CSV file? (y/n): ").strip().lower()
    if save_option == 'y':
        file_name = input("Enter the file name without extension (press Enter for the default 'base_crawling_from_gg_scholar'): ").strip()
        if file_name:
            save_to_csv(results, file_name + ".csv")
        else:
            save_to_csv(results)


if __name__ == "__main__":
    main()
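
Because utils/basic_crawl_gg_scholar.py doubles as a standalone CLI, it can also be driven programmatically. A minimal sketch of that use; the query string, year window, and output file name below are arbitrary example values:

from utils.basic_crawl_gg_scholar import scrape_gg_scholar, save_to_csv

# Example query and year window -- purely illustrative values.
rows = scrape_gg_scholar("graph neural networks", num_pages=2, start_year=2020, end_year=2025)
print(f"Scraped {len(rows)} articles.")
if rows:
    save_to_csv(rows, "gnn_results.csv")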