jonathanjordan21 committed on
Commit
f0d4b40
·
verified ·
1 Parent(s): 7e1d282

Create extract.py

Browse files
Files changed (1) hide show
  1. extract.py +180 -0
extract.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.common.exceptions import WebDriverException
3
+ from PIL import Image
4
+ from io import BytesIO
5
+
6
+ import time
7
+
8
+ from selenium.webdriver.common.by import By
9
+ from selenium.webdriver.support.ui import WebDriverWait
10
+ from selenium.webdriver.support import expected_conditions as EC
11
+
12
def take_webdata(url):
    """Load *url* in headless Chrome and return a screenshot plus the page title.

    Args:
        url: Address handed to ``WebDriver.get``.

    Returns:
        An ``(image, title)`` tuple. ``image`` is a PIL image decoded from the
        PNG screenshot and ``title`` is the page title. If a
        ``WebDriverException`` is raised at any point, ``image`` is a 1x1 RGB
        placeholder and ``title`` is whatever was read before the failure
        (``""`` when the title was never reached).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Bound before the try block so the except/finally clauses can always
    # reference them, even when Chrome fails to start or the page fails to
    # load. Previously those paths raised UnboundLocalError instead of
    # returning the placeholder.
    wd = None
    page_title = ""

    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Adjust the window size here
        wd.get(url)
        # Configures the implicit element-lookup timeout (it is not a sleep).
        wd.implicitly_wait(5)
        page_title = wd.title
        screenshot = wd.get_screenshot_as_png()
    except WebDriverException:
        return Image.new('RGB', (1, 1)), page_title
    finally:
        # Only quit a browser that was actually started.
        if wd is not None:
            wd.quit()

    return Image.open(BytesIO(screenshot)), page_title
34
+
35
+
36
def _scrape_detail_rows(driver, container_id, tag):
    """Collect pokok/denda/total entries from the ``.row`` elements under ``#container_id``.

    ``tag`` only labels the debug output (e.g. ``"PKB"`` or ``"SWD"``).
    Entries gathered before a failure are still returned.
    """
    details = []
    try:
        # The first ".row" is skipped as the header.
        rows = driver.find_elements(By.CSS_SELECTOR, f"#{container_id} .row")[1:]
        for row in rows:
            print(f"[ROW {tag}]", row.text)
            # Read each cell's text once; every ``.text`` access is a
            # round trip to the browser.
            texts = [cell.text for cell in row.find_elements(By.TAG_NAME, "p")]
            print(f"[COLS {tag}]", texts)
            if len(texts) >= 3:
                details.append({
                    "pokok": texts[0].strip(),
                    "denda": texts[1].strip(),
                    "total": texts[2].strip(),
                })
    except Exception as e:
        # Best-effort scraping: report the problem and keep what was parsed.
        print(f"Gagal parsing {container_id}:", e)
    return details


def scrape_vehicle(driver):
    """Extract vehicle data and billing details from the page loaded in *driver*.

    Args:
        driver: A Selenium WebDriver already showing the result page.

    Returns:
        A 4-tuple ``(data_kendaraan, total_tagihan, rincians_pkb, rincians_swd)``:

        * ``data_kendaraan`` - dict built from ``table tr`` rows that have at
          least three ``td`` cells; the key is the normalised text of the
          first cell and the value is the text of the third cell.
        * ``total_tagihan`` - list of dicts with ``pokok``/``denda``/``total``/
          ``jenis`` taken from ``div.row`` elements holding at least four
          ``p`` children, skipping rows whose text contains "Pokok", "Denda"
          or "Total".
        * ``rincians_pkb`` / ``rincians_swd`` - lists of dicts with
          ``pokok``/``denda``/``total`` from the rows under ``#det_pkb`` and
          ``#det_swd`` respectively.

        Each section is parsed independently; a failure in one is printed and
        leaves that section with whatever was collected so far.
    """
    data_kendaraan = {}
    try:
        for row in driver.find_elements(By.CSS_SELECTOR, "table tr"):
            cols = row.find_elements(By.TAG_NAME, "td")
            if len(cols) >= 3:
                # Normalise the label into a snake_case-like key.
                # NOTE(review): the middle cell is ignored - presumably a
                # separator column; confirm against the page markup.
                key = cols[0].text.strip().lower().replace(".", "").replace(" ", "_")
                data_kendaraan[key] = cols[2].text.strip()
    except Exception as e:
        print("Gagal parsing tabel:", e)

    total_tagihan = []
    try:
        for row in driver.find_elements(By.CSS_SELECTOR, "div.row"):
            row_text = row.text
            print("[ROW TOTAL]", row_text)
            # Rows mentioning any of these words are skipped.
            if any(word in row_text for word in ("Pokok", "Denda", "Total")):
                continue
            texts = [cell.text for cell in row.find_elements(By.TAG_NAME, "p")]
            print("[COLS TOTAL]", texts)
            if len(texts) >= 4:
                total_tagihan.append({
                    "pokok": texts[0].strip(),
                    "denda": texts[1].strip(),
                    "total": texts[2].strip(),
                    "jenis": texts[3].strip(),
                })
    except Exception as e:
        print("Gagal parsing total tagihan:", e)

    rincians_pkb = _scrape_detail_rows(driver, "det_pkb", "PKB")
    rincians_swd = _scrape_detail_rows(driver, "det_swd", "SWD")

    return data_kendaraan, total_tagihan, rincians_pkb, rincians_swd
122
+
123
+
124
def get_vehicle_info(plate_number: str):
    """Look up *plate_number* on the Jambi Samsat PKB info page and scrape the result.

    The function opens the lookup form in headless Chrome, submits the plate
    number, expands the PKB and SWD detail sections and hands the driver to
    ``scrape_vehicle``.

    Args:
        plate_number: Text typed into the ``no_polisi`` input field.

    Returns:
        The 4-tuple returned by ``scrape_vehicle``:
        ``(data_kendaraan, total_tagihan, rincians_pkb, rincians_swd)``.
        If a ``WebDriverException`` occurs while driving the page, the error
        is printed and ``({}, [], [], [])`` is returned so callers always
        receive the same shape.
    """
    # Configure headless Chrome
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(options=options)

    try:
        driver.get("https://www.jambisamsat.net/infopkb.html")
        time.sleep(1)

        # ``until`` returns the located element once it is present.
        input_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "no_polisi"))
        )
        input_field.clear()
        input_field.send_keys(plate_number)

        submit_button = driver.find_element(
            By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]'
        )
        submit_button.click()

        # Wait for the result page to load
        WebDriverWait(driver, 10).until(EC.url_contains("infopkb.php"))

        # Sets the implicit element-lookup timeout for later find_* calls
        # (it is not a pause).
        driver.implicitly_wait(3)

        # Grow the window to the full document height before clicking.
        scroll_height = driver.execute_script("return document.body.scrollHeight")
        driver.set_window_size(1920, scroll_height + 200)

        # Expand both detail sections before scraping them.
        for button_id in ("show_det_pkb", "show_det_swd"):
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, button_id))
            )
            button.click()

        # Short pause after the clicks before reading the DOM.
        time.sleep(0.6)

        return scrape_vehicle(driver)

    except WebDriverException as e:
        # The previous handler referenced an undefined ``page_title`` and so
        # raised NameError; return empty results in the success shape instead.
        print("Gagal mengambil data kendaraan:", e)
        return {}, [], [], []

    finally:
        driver.quit()