output/scan_result_20241030_210745.txt
DELETED
@@ -1,242 +0,0 @@
# File path
Get_URL_list/get_url_list.py
------------
import json
import requests
from bs4 import BeautifulSoup

# Load the main theme URLs from the JSON file
with open('ideabte_scraping/Get_URL_list/URL_json_output/debate_urls.json', 'r') as f:
    json_urls = json.load(f)

# Function to get sub-page URLs from a main theme URL
def get_debate_topic_urls(main_url):
    response = requests.get(main_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all links from the main URL page
    links = soup.find_all('a', href=True)

    # Filter for links that are debate topics (site-relative paths)
    topic_urls = [link['href'] for link in links if link['href'].startswith('/')]

    # Make URLs absolute, keeping only topic pages (their slugs contain "~b")
    full_urls = [f"https://idebate.net{url}" for url in topic_urls if "~b" in url]

    return full_urls

# Dictionary to store all debate topic URLs for each main theme URL
all_debate_topic_urls = {}
for theme_url in json_urls:
    theme_name = theme_url.split("/")[-2].replace("~", "_")
    all_debate_topic_urls[theme_name] = get_debate_topic_urls(theme_url)

# Output the results
with open('ideabte_scraping/Get_URL_list/output/debate_topic_urls.json', 'w') as f:
    json.dump(all_debate_topic_urls, f, indent=4)

print("Debate topic URLs have been saved to debate_topic_urls.json")
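
For reference, a minimal sketch of the data shapes this script assumes; the theme URL below is a hypothetical placeholder (the real debate_urls.json is not shown in this scan), while the topic URL is the one used in scraping_test.py further down.

# Assumed shape of debate_urls.json: a flat list of theme index URLs, each
# ending in "/" so that split("/")[-2] yields the theme segment.
sample_input = ["https://idebate.net/economy~c12/"]  # hypothetical theme URL

theme_name = sample_input[0].split("/")[-2].replace("~", "_")
print(theme_name)  # -> economy_c12

# debate_topic_urls.json then maps each theme name to its topic URLs, e.g.:
# {"economy_c12": ["https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"]}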
# File path
scraping_idebate/run_main.sh
------------
#!/bin/bash

# Set default paths
JSON_FILE="ideabte_scraping/Get_URL_list/output/debate_topic_urls.json"
OUTPUT_DIR="ideabte_scraping/scraping_idebate/output"

# Check if the JSON file exists
if [ ! -f "$JSON_FILE" ]; then
    echo "Error: JSON file '$JSON_FILE' does not exist."
    exit 1
fi

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Run the Python script
python3 ideabte_scraping/scraping_idebate/src/scraping.py "$JSON_FILE" "$OUTPUT_DIR"

echo "Scraping completed. Output files are stored in $OUTPUT_DIR"
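
Assuming the working directory is the repository root (the relative data paths above require it), the scraping step is launched with "bash scraping_idebate/run_main.sh"; the script aborts early if debate_topic_urls.json has not yet been produced by the URL-listing step.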
# File path
scraping_idebate/src/scraping.py
------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse

def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)

    points_list = []

    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_path}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    for category, urls in url_data.items():
        for url in urls:
            try:
                scrape_url(url, output_dir)
            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
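
Two details of scrape_url are easiest to see with a small worked example: the output filename is derived from the URL path, and each file holds a list of point records. A sketch, using the topic URL that also appears in scraping_test.py below; the record values are illustrative, not real scraped data.

from urllib.parse import urlparse

url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"
parsed_url = urlparse(url)
print(f"{parsed_url.path.strip('/').replace('/', '_')}.json")
# -> this-house-would-make-all-museums-free-of-charge~b641.json

# Each such file is a JSON list of records shaped like this:
example_record = {
    "topic": "This House would make all museums free of charge",  # illustrative
    "section": "Points For",
    "context": "**Point subtitle**\nPoint body paragraph."
}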
# File path
scraping_idebate/src/scraping_test.py
------------
import requests
from bs4 import BeautifulSoup

url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"

# Fetch the web page
response = requests.get(url)
response.raise_for_status()  # error check

# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# Get the "Points For" div element
points_for_section = soup.find('div', class_='points-vote points-vote--for')

# Get the accordion elements containing the points
accordion_items = points_for_section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')

# Extract the text of each point
points = []
for item in accordion_items:
    point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
    point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
    points.append(f"**{point_subtitle}**\n{point_body}")

# Print the extracted points
for point in points:
    print(point)
    print("-" * 20)  # separator line
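
One caveat: unlike scraping.py above, this test script calls find_next_sibling directly on the result of soup.find, so a page without a "Points For" block raises AttributeError. A minimal guard, shown here as an illustrative addition rather than part of the original, would go just before the accordion lookup:

if points_for_section is None:
    raise SystemExit(f"No 'Points For' section found at {url}")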
# File path
scraping_idebate/src/scraping_tqdm.py
------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse
from tqdm import tqdm

def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)

    points_list = []

    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    return output_path

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    total_urls = sum(len(urls) for urls in url_data.values())

    with tqdm(total=total_urls, desc="Scraping Progress") as pbar:
        for category, urls in url_data.items():
            for url in urls:
                try:
                    output_path = scrape_url(url, output_dir)
                    pbar.set_postfix_str(f"Saved: {output_path}")
                    pbar.update(1)
                except Exception as e:
                    pbar.set_postfix_str(f"Error: {url}")
                    print(f"\nError scraping {url}: {str(e)}")
                    pbar.update(1)

    print("\nScraping completed. All data saved to the output directory.")
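
This tqdm variant takes the same two command-line arguments as scraping.py, so run_main.sh could point its python3 line at scraping_tqdm.py instead, e.g. python3 ideabte_scraping/scraping_idebate/src/scraping_tqdm.py "$JSON_FILE" "$OUTPUT_DIR". The only behavioral differences are the progress bar and that scrape_url returns the output path so it can be shown in the bar's postfix instead of being printed per file.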