DeL-TaiseiOzaki committed on
Commit d32c21c · verified · 1 Parent(s): c07ea46

Delete output

output/scan_result_20241030_210745.txt DELETED
@@ -1,242 +0,0 @@
- # File path
- Get_URL_list/get_url_list.py
- ------------
- import json
- import requests
- from bs4 import BeautifulSoup
-
- # Load URLs from JSON file
- with open('ideabte_scraping/Get_URL_list/URL_json_output/debate_urls.json', 'r') as f:
-     json_urls = json.load(f)
-
- # Function to get sub-page URLs from a main theme URL
- def get_debate_topic_urls(main_url):
-     response = requests.get(main_url)
-     soup = BeautifulSoup(response.text, 'html.parser')
-
-     # Extract all links from the main URL page
-     links = soup.find_all('a', href=True)
-
-     # Filter for links that are debate topics
-     topic_urls = [link['href'] for link in links if link['href'].startswith('/')]
-
-     # Make URLs absolute
-     full_urls = [f"https://idebate.net{url}" for url in topic_urls if "~b" in url]
-
-     return full_urls
-
- # Dictionary to store all debate topic URLs for each main theme URL
- all_debate_topic_urls = {}
- for theme_url in json_urls:
-     theme_name = theme_url.split("/")[-2].replace("~", "_")
-     all_debate_topic_urls[theme_name] = get_debate_topic_urls(theme_url)
-
- # Output the results
- with open('ideabte_scraping/Get_URL_list/output/debate_topic_urls.json', 'w') as f:
-     json.dump(all_debate_topic_urls, f, indent=4)
-
- print("Debate topic URLs have been saved to debate_topic_urls.json")
-
- # File path
- scraping_idebate/run_main.sh
- ------------
- #!/bin/bash
-
- # Set default paths
- JSON_FILE="ideabte_scraping/Get_URL_list/output/debate_topic_urls.json"
- OUTPUT_DIR="ideabte_scraping/scraping_idebate/output"
-
- # Check if the JSON file exists
- if [ ! -f "$JSON_FILE" ]; then
-     echo "Error: JSON file '$JSON_FILE' does not exist."
-     exit 1
- fi
-
- # Create the output directory if it doesn't exist
- mkdir -p "$OUTPUT_DIR"
-
- # Run the Python script
- python3 ideabte_scraping/scraping_idebate/src/scraping.py "$JSON_FILE" "$OUTPUT_DIR"
-
- echo "Scraping completed. Output files are stored in $OUTPUT_DIR"
-
- # File path
- scraping_idebate/src/scraping.py
- ------------
- import requests
- from bs4 import BeautifulSoup
- import json
- import os
- import sys
- from urllib.parse import urlparse
-
- def scrape_url(url, output_dir):
-     response = requests.get(url)
-     response.raise_for_status()
-
-     soup = BeautifulSoup(response.content, 'html.parser')
-     topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)
-
-     points_list = []
-
-     def extract_points(section, section_name):
-         accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
-         for item in accordion_items:
-             point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
-             point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
-             points_list.append({
-                 "topic": topic,
-                 "section": section_name,
-                 "context": f"**{point_subtitle}**\n{point_body}"
-             })
-
-     points_for_section = soup.find('div', class_='points-vote points-vote--for')
-     if points_for_section:
-         extract_points(points_for_section, "Points For")
-
-     points_against_section = soup.find('div', class_='points-vote points-vote--against')
-     if points_against_section:
-         extract_points(points_against_section, "Points Against")
-
-     # Generate a unique filename based on the URL
-     parsed_url = urlparse(url)
-     filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
-     output_path = os.path.join(output_dir, filename)
-
-     with open(output_path, "w", encoding="utf-8") as f:
-         json.dump(points_list, f, ensure_ascii=False, indent=4)
-
-     print(f"Data saved to {output_path}")
-
- if __name__ == "__main__":
-     if len(sys.argv) != 3:
-         print("Usage: python script.py <json_file> <output_dir>")
-         sys.exit(1)
-
-     json_file = sys.argv[1]
-     output_dir = sys.argv[2]
-
-     os.makedirs(output_dir, exist_ok=True)
-
-     with open(json_file, 'r') as f:
-         url_data = json.load(f)
-
-     for category, urls in url_data.items():
-         for url in urls:
-             try:
-                 scrape_url(url, output_dir)
-             except Exception as e:
-                 print(f"Error scraping {url}: {str(e)}")
-
- # File path
- scraping_idebate/src/scraping_test.py
- ------------
- import requests
- from bs4 import BeautifulSoup
-
- url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"
-
- # Fetch the web page
- response = requests.get(url)
- response.raise_for_status()  # Check for errors
-
- # Parse the HTML
- soup = BeautifulSoup(response.content, 'html.parser')
-
- # Get the "Points For" div element
- points_for_section = soup.find('div', class_='points-vote points-vote--for')
-
- # Get the accordion items that contain the points
- accordion_items = points_for_section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
-
- # Extract the text of each point
- points = []
- for item in accordion_items:
-     point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
-     point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
-     points.append(f"**{point_subtitle}**\n{point_body}")
-
- # Print the extracted points
- for point in points:
-     print(point)
-     print("-" * 20)  # Separator line
-
-
- # File path
- scraping_idebate/src/scraping_tqdm.py
- ------------
- import requests
- from bs4 import BeautifulSoup
- import json
- import os
- import sys
- from urllib.parse import urlparse
- from tqdm import tqdm
-
- def scrape_url(url, output_dir):
-     response = requests.get(url)
-     response.raise_for_status()
-
-     soup = BeautifulSoup(response.content, 'html.parser')
-     topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)
-
-     points_list = []
-
-     def extract_points(section, section_name):
-         accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
-         for item in accordion_items:
-             point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
-             point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
-             points_list.append({
-                 "topic": topic,
-                 "section": section_name,
-                 "context": f"**{point_subtitle}**\n{point_body}"
-             })
-
-     points_for_section = soup.find('div', class_='points-vote points-vote--for')
-     if points_for_section:
-         extract_points(points_for_section, "Points For")
-
-     points_against_section = soup.find('div', class_='points-vote points-vote--against')
-     if points_against_section:
-         extract_points(points_against_section, "Points Against")
-
-     # Generate a unique filename based on the URL
-     parsed_url = urlparse(url)
-     filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
-     output_path = os.path.join(output_dir, filename)
-
-     with open(output_path, "w", encoding="utf-8") as f:
-         json.dump(points_list, f, ensure_ascii=False, indent=4)
-
-     return output_path
-
- if __name__ == "__main__":
-     if len(sys.argv) != 3:
-         print("Usage: python script.py <json_file> <output_dir>")
-         sys.exit(1)
-
-     json_file = sys.argv[1]
-     output_dir = sys.argv[2]
-
-     os.makedirs(output_dir, exist_ok=True)
-
-     with open(json_file, 'r') as f:
-         url_data = json.load(f)
-
-     total_urls = sum(len(urls) for urls in url_data.values())
-
-     with tqdm(total=total_urls, desc="Scraping Progress") as pbar:
-         for category, urls in url_data.items():
-             for url in urls:
-                 try:
-                     output_path = scrape_url(url, output_dir)
-                     pbar.set_postfix_str(f"Saved: {output_path}")
-                     pbar.update(1)
-                 except Exception as e:
-                     pbar.set_postfix_str(f"Error: {url}")
-                     print(f"\nError scraping {url}: {str(e)}")
-                     pbar.update(1)
-
-     print("\nScraping completed. All data saved to the output directory.")
-
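
The scripts above form a two-stage pipeline: get_url_list.py expands each theme URL from debate_urls.json into per-topic URLs and writes debate_topic_urls.json, and run_main.sh passes that file to scraping.py, which saves one JSON file per topic. A minimal usage sketch, assuming the ideabte_scraping/ layout hard-coded in those scripts and that the commands are run from the directory containing it (the exact location of get_url_list.py is an assumption based on those hard-coded paths):

# Sketch only; paths follow the ideabte_scraping/ layout referenced in the scripts above.
python3 ideabte_scraping/Get_URL_list/get_url_list.py    # writes Get_URL_list/output/debate_topic_urls.json
bash ideabte_scraping/scraping_idebate/run_main.sh       # scrapes each topic URL into scraping_idebate/output/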