import re
import csv
import time
import random

import requests
from bs4 import BeautifulSoup

def extract_data_from_publication_info(publication_info):
    # Parse the ".gs_a" line ("authors - journal, year - publisher") with a single regex.
    regex_pattern = r"(.+?)\s+-\s+(.+?),\s+(\d{4})\s+-\s+(.+)$"
    match = re.match(regex_pattern, publication_info)
    if match:
        authors = match.group(1).strip()
        journal = match.group(2).strip()
        year = match.group(3).strip()
        publisher = match.group(4).strip()
    else:
        authors = journal = year = publisher = "Unknown"
    return {
        "authors": authors,
        "journal": journal,
        "year": year,
        "publisher": publisher
    }
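
# Illustrative example of the input this parser expects (the exact ".gs_a" text is an
# assumption and varies by result and locale on Google Scholar):
#   extract_data_from_publication_info("J Smith, A Doe - Nature communications, 2021 - nature.com")
#   -> {"authors": "J Smith, A Doe", "journal": "Nature communications",
#       "year": "2021", "publisher": "nature.com"}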

def scrape_gg_scholar(query, num_pages=1, start_year=None, end_year=None):
    results = []
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    base_url = "https://scholar.google.com/scholar"
    params = {
        "q": query,
        "hl": "en",
        "as_sdt": "0,5"
    }
    if start_year and end_year:
        params["as_ylo"] = start_year
        params["as_yhi"] = end_year
    for page in range(num_pages):
        params["start"] = page * 10  # Google Scholar paginates 10 results per page
        try:
            # Let requests build and URL-encode the query string from the params dict.
            response = requests.get(base_url, params=params, headers=header, timeout=30)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                articles = soup.select(".gs_r.gs_or.gs_scl")
                for article in articles:
                    title_element = article.select_one(".gs_rt a")
                    if title_element:
                        title = title_element.text
                        link = title_element["href"]
                        abstract_element = article.select_one(".gs_rs")
                        abstract = abstract_element.text if abstract_element else ""
                        abstract = abstract.replace("…", "").strip()
                        abstract = abstract.replace("\n", "").strip()
                        abstract = " ".join(abstract.split())
                        pub_info_element = article.select_one(".gs_a")
                        pub_info = pub_info_element.text if pub_info_element else ""
                        pub_info_parsed = extract_data_from_publication_info(pub_info)
                        results.append({
                            "GGS_title": title,
                            "GGS_link": link,
                            "GGS_brief_abstract": abstract,
                            "GGS_publication_info": pub_info,
                            "GGS_authors": pub_info_parsed["authors"],
                            "GGS_journal": pub_info_parsed["journal"],
                            "GGS_year": pub_info_parsed["year"],
                            "GGS_publisher": pub_info_parsed["publisher"]
                        })
                time.sleep(random.uniform(1, 3))  # Random sleep between pages to avoid being blocked
            else:
                print(f"ERROR: STATUS CODE {response.status_code}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break
    return results
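
# Example of calling the scraper directly (illustrative values only; Google Scholar's
# markup and rate limiting change over time, so the CSS selectors above may need updating):
#   papers = scrape_gg_scholar("graph neural networks", num_pages=2,
#                              start_year="2020", end_year="2025")
#   print(papers[0]["GGS_title"] if papers else "No results")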

def save_to_csv(data, filename="base_crawling_from_gg_scholar.csv"):
    if not data:
        print("No data to save.")
        return
    with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    print(f"Data saved to {filename}")

def main():
    query = input("Enter your search query: ")
    num_pages = int(input("Enter the number of pages to scrape: "))
    use_time_filter = input("Do you want to filter by year? (y/n): ").strip().lower()
    start_year = None
    end_year = None
    if use_time_filter == 'y':
        start_year = input("Enter the start year (format: YYYY; for example: 2020): ")
        end_year = input("Enter the end year (format: YYYY; for example: 2025): ")
    results = scrape_gg_scholar(query, num_pages, start_year, end_year)
    print(f"Found {len(results)} results.")
    for i, result in enumerate(results):
        print(f"{i + 1}. {result['GGS_title']}")
        print(f"   Link: {result['GGS_link']}")
        print(f"   Brief Abstract: {result['GGS_brief_abstract']}")
        print(f"   Publication Info: {result['GGS_publication_info']}")
        print(f"   Authors: {result['GGS_authors']}")
        print(f"   Journal: {result['GGS_journal']}")
        print(f"   Year: {result['GGS_year']}")
        print(f"   Publisher: {result['GGS_publisher']}")
        print("=" * 100)
    save_option = input("Do you want to save the results to a CSV file? (y/n): ").strip().lower()
    if save_option == 'y':
        file_name = input("Enter the file name without extension (press Enter for the default 'base_crawling_from_gg_scholar'): ").strip()
        if file_name:
            save_to_csv(results, f"{file_name}.csv")
        else:
            save_to_csv(results)


if __name__ == "__main__":
    main()
# The code is designed to scrape Google Scholar for academic articles based on a search query.