ntphuc149 committed
Commit ce57c5d · verified · 1 Parent(s): 1d9bdc2

Upload 5 files

.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="light"
+ font="serif"
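For context, base and font under [theme] are standard Streamlit theming options; this file simply forces the light theme with a serif font.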
app.py ADDED
@@ -0,0 +1,31 @@
+ import time
+ import requests
+ import pandas as pd
+ import streamlit as st
+ from utils.basic_crawl_gg_scholar import scrape_gg_scholar
+ from utils.retrieve_doi_by_name import get_doi_by_title
+ from utils.get_abstract_by_doi import get_abstract_by_doi
+
+
+ st.set_page_config(page_title="GG Scholar Crawler :v", page_icon=":book:", layout="centered")
+
+ st.title("Google Scholar Crawler :book:")
+
+ col_1, col_2, col_3, col_4 = st.columns(spec=[5, 1, 1, 1])
+
+ keyword = col_1.text_input("Keyword to search:", key="keyword", placeholder="Enter keyword to search...", label_visibility="hidden")
+ num_pages = col_2.number_input("Pages:", key="pages", placeholder="Number of pages:...", min_value=1, max_value=9999999, value=1, step=1)
+ start_year = col_3.number_input("Start:", min_value=1900, max_value=2025, value=2020, key="start_year")
+ end_year = col_4.number_input("End:", min_value=1900, max_value=2025, value=2025, key="end_year")
+ is_start = st.button("Crawl!", key="crawl_button")
+
+ if is_start:
+     with st.spinner("Crawling basic info..."):
+         basic_crawled_data = scrape_gg_scholar(query=keyword, num_pages=num_pages, start_year=start_year, end_year=end_year)
+         st.dataframe(basic_crawled_data, use_container_width=True)
+         st.success("Crawled basic info successfully!")
+
+     # with st.spinner("Retrieving DOI..."):
+     #     doi_crawled_data = get_doi_by_title(basic_crawled_data)
+     #     st.dataframe(doi_crawled_data, use_container_width=True)
+     #     st.success("Retrieved DOI successfully!")
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ requests
+ feedparser
+ beautifulsoup4
+ urllib3
+ pandas
+ streamlit
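With these dependencies in place, the app should launch the standard Streamlit way: pip install -r requirements.txt, then streamlit run app.py. (urllib3 is already a transitive dependency of requests, so pinning it here is optional.)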
utils/__init__.py ADDED
File without changes
utils/basic_crawl_gg_scholar.py ADDED
@@ -0,0 +1,162 @@
+ import re
+ import csv
+ import json
+ import time
+ import random
+ import requests
+ import feedparser
+ from bs4 import BeautifulSoup
+ from urllib.parse import quote
+ from datetime import datetime
+
+ def extract_data_from_publication_info(publication_info):
+
+     authors = ""
+     journal = ""
+     year = ""
+     publisher = ""
+
+     regex_pattern = r"(.+?)\s+-\s+(.+?),\s+(\d{4})\s+-\s+(.+)$"
+
+     match = re.match(regex_pattern, publication_info)
+
+     if match:
+         authors = match.group(1).strip()
+         journal = match.group(2).strip()
+         year = match.group(3).strip()
+         publisher = match.group(4).strip()
+     else:
+         authors = "Unknown"
+         journal = "Unknown"
+         year = "Unknown"
+         publisher = "Unknown"
+
+     return {
+         "authors": authors,
+         "journal": journal,
+         "year": year,
+         "publisher": publisher
+     }
+
+ def scrape_gg_scholar(query, num_pages=1, start_year=None, end_year=None):
+     results = []
+
+     header = {
+         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+     }
+
+     base_url = "https://scholar.google.com/scholar?"
+
+     params = {
+         "q": query.replace(" ", "+"),
+         "hl": "en",
+         "as_sdt": "0,5"
+     }
+
+     if start_year and end_year:
+         params["as_ylo"] = start_year
+         params["as_yhi"] = end_year
+
+     for pages in range(num_pages):
+         start = pages * 10
+         params["start"] = start
+
+         url_params = "&".join([f"{k}={v}" for k, v in params.items()])
+
+         url = base_url + url_params
+
+         try:
+             response = requests.get(url, headers=header)
+
+             if response.status_code == 200:
+                 soup = BeautifulSoup(response.text, "html.parser")
+
+                 articles = soup.select(".gs_r.gs_or.gs_scl")
+
+                 for article in articles:
+                     title_element = article.select_one(".gs_rt a")
+
+                     if title_element:
+                         title = title_element.text
+                         link = title_element["href"]
+
+                         abstract_element = article.select_one(".gs_rs")
+                         abstract = abstract_element.text if abstract_element else ""
+                         abstract = abstract.replace("…", "").strip()
+                         abstract = abstract.replace("\n", "").strip()
+                         abstract = " ".join(abstract.split())
+
+                         pub_info_element = article.select_one(".gs_a")
+                         pub_info = pub_info_element.text if pub_info_element else ""
+
+                         pub_info_parsed = extract_data_from_publication_info(pub_info)
+
+                         results.append({
+                             "GGS_title": title,
+                             "GGS_link": link,
+                             "GGS_brief_abstract": abstract,
+                             "GGS_publication_info": pub_info,
+                             "GGS_authors": pub_info_parsed["authors"],
+                             "GGS_journal": pub_info_parsed["journal"],
+                             "GGS_year": pub_info_parsed["year"],
+                             "GGS_publisher": pub_info_parsed["publisher"]
+                         })
+
+                 time.sleep(random.uniform(1, 3))  # Random sleep to avoid being blocked
+
+             else:
+                 print(f"ERROR: STATUS CODE {response.status_code}")
+                 break
+
+         except Exception as e:
+             print(f"An error occurred: {e}")
+             break
+
+     return results
+
+ def save_to_csv(data, filename="base_crawling_from_gg_scholar.csv"):
+     with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile:
+         fieldnames = data[0].keys()
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+         writer.writeheader()
+         for row in data:
+             writer.writerow(row)
+
+     print(f"Data saved to {filename}")
+
+ def main():
+     query = input("Enter your search query: ")
+     num_pages = int(input("Enter the number of pages to scrape: "))
+
+     use_time_filter = input("Do you want to filter by year? (y/n): ").strip().lower()
+     start_year = None
+     end_year = None
+
+     if use_time_filter == 'y':
+         start_year = input("Enter the start year (format: YYYY; for example: 2020): ")
+         end_year = input("Enter the end year (format: YYYY; for example: 2025): ")
+
+     results = scrape_gg_scholar(query, num_pages, start_year, end_year)
+
+     print(f"Found {len(results)} results.")
+
+     for i, result in enumerate(results):
+         print(f"{i + 1}. {result['GGS_title']}")
+         print(f"   Link: {result['GGS_link']}")
+         print(f"   Brief Abstract: {result['GGS_brief_abstract']}")
+         print(f"   Publication Info: {result['GGS_publication_info']}")
+         print(f"   Authors: {result['GGS_authors']}")
+         print(f"   Journal: {result['GGS_journal']}")
+         print(f"   Year: {result['GGS_year']}")
+         print(f"   Publisher: {result['GGS_publisher']}")
+         print("=" * 100)
+
+     save_option = input("Do you want to save the results to a CSV file? (y/n): ").strip().lower()
+     if save_option == 'y':
+         file_name = input("Enter the file name (default is 'base_crawling_from_gg_scholar.csv'; enter the name without extension): ").strip()
+         save_to_csv(results, f"{file_name}.csv" if file_name else "base_crawling_from_gg_scholar.csv")  # use the entered name if provided
+
+ if __name__ == "__main__":
+     main()
+ # The code is designed to scrape Google Scholar for academic articles based on a search query.
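For quick testing outside the Streamlit UI, the module can also be driven directly (run from the repository root so the utils package resolves). A minimal sketch with a placeholder query and file name follows; keep in mind Google Scholar rate-limits scrapers, which is why the code already sleeps 1-3 s between pages.

from utils.basic_crawl_gg_scholar import scrape_gg_scholar, save_to_csv

# Placeholder query, year range, and output file name, for illustration only.
results = scrape_gg_scholar("graph neural networks", num_pages=2, start_year=2021, end_year=2025)
print(f"Scraped {len(results)} records")

if results:
    save_to_csv(results, filename="gnn_scholar_sample.csv")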