Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
fc32112
1
Parent(s):
f7e5bce
add more heuristics to find logos, might be too much
Browse files
Signed-off-by: Niv Sardi <[email protected]>
- crawler/common/selectors.py +4 -1
- crawler/imtool.py +1 -1
- crawler/screenshot.py +3 -1
- src/index.ts +29 -3
- src/selectors.ts +3 -1
crawler/common/selectors.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
|
3 |
-
|
|
|
|
|
|
|
4 |
logosbancos = "img[src*=logosbancos]"
|
5 |
|
6 |
entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
|
3 |
+
img_logo = "img[src*=logo]"
|
4 |
+
id_logo = "*[id*=logo]"
|
5 |
+
cls_logo = "*[class*=logo]"
|
6 |
+
|
7 |
logosbancos = "img[src*=logosbancos]"
|
8 |
|
9 |
entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
|
crawler/imtool.py
CHANGED
@@ -9,7 +9,7 @@ from typing import NamedTuple
|
|
9 |
from entity import Entity
|
10 |
|
11 |
TILE_SIZE = 800
|
12 |
-
TILE_OVERLAP = 0.
|
13 |
|
14 |
class BoundingBox(NamedTuple):
|
15 |
x: float = 0.0
|
|
|
9 |
from entity import Entity
|
10 |
|
11 |
TILE_SIZE = 800
|
12 |
+
TILE_OVERLAP = 0.8
|
13 |
|
14 |
class BoundingBox(NamedTuple):
|
15 |
x: float = 0.0
|
crawler/screenshot.py
CHANGED
@@ -27,7 +27,9 @@ def sc_entity(e: Entity):
|
|
27 |
driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
|
28 |
driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
|
29 |
|
30 |
-
logos = driver.find_elements(By.CSS_SELECTOR, selectors.
|
|
|
|
|
31 |
with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
|
32 |
for i in logos:
|
33 |
f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
|
|
|
27 |
driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
|
28 |
driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
|
29 |
|
30 |
+
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
31 |
+
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
32 |
+
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
33 |
with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
|
34 |
for i in logos:
|
35 |
f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
|
src/index.ts
CHANGED
@@ -22,6 +22,21 @@ queue.addEventListener("idle", async () => {
|
|
22 |
console.log("all done")
|
23 |
})
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
function process(o: { url: string, bco: string, name: string }): Promise<void> {
|
26 |
const promises: Promise<void>[] = [];
|
27 |
|
@@ -30,11 +45,22 @@ function process(o: { url: string, bco: string, name: string }): Promise<void> {
|
|
30 |
promises.push(new Promise<void>((accept, _reject) => {
|
31 |
page.once('load', async () => {
|
32 |
try {
|
33 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
let annotations = '';
|
35 |
for (const i in logos) {
|
36 |
-
const bb =
|
37 |
-
if (!bb
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
try {
|
40 |
await logos[i].screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
|
|
|
22 |
console.log("all done")
|
23 |
})
|
24 |
|
25 |
+
/** Geometry recorded for one candidate logo element. */
interface LogoBox {
  x: number;
  y: number;
  width: number;
  height: number;
  // `top`/`left` are read from `window.screen`, where TypeScript's DOM typings
  // do not declare them, so they may be undefined at runtime.
  top: number | undefined;
  left: number | undefined;
}

/**
 * The minimal page surface `get_logos` relies on.
 *
 * NOTE(review): presumably satisfied by the browser-automation page object
 * used elsewhere in this file (its import is not visible here) — confirm.
 */
interface LogoPage<H> {
  $$(selector: string): Promise<H[] | null | undefined>;
  evaluate(pageFunction: (element: Element) => LogoBox, handle: H): Promise<LogoBox>;
}

/**
 * Finds every element matching `selector` and attaches its bounding box.
 *
 * Each handle returned by `page.$$` is measured through `page.evaluate` and
 * the resulting rectangle is stored on the handle itself as `.box`, so callers
 * can keep using the handle (e.g. to screenshot it) and read `.box` from it.
 *
 * @param page - Page object exposing `$$` and `evaluate`.
 * @param selector - CSS selector for candidate logo elements.
 * @returns The matched handles, in match order, each extended with `.box`.
 *   An empty array when nothing matches.
 */
async function get_logos<H extends object>(
  page: LogoPage<H>,
  selector: string,
): Promise<Array<H & { box: LogoBox }>> {
  const handles = (await page.$$(selector)) ?? [];
  const logos: Array<H & { box: LogoBox }> = [];

  // Measured one at a time, in match order, as the original loop did.
  for (const handle of handles) {
    // The callback is kept inline and free of outer references because it is
    // handed to `page.evaluate` — presumably executed in the page, not here.
    const box = await page.evaluate((element: Element): LogoBox => {
      const { x, y, width, height } = element.getBoundingClientRect();
      const screenInfo = window.screen as Screen & { top?: number; left?: number };
      return { x, y, width, height, top: screenInfo.top, left: screenInfo.left };
    }, handle);

    // Mutate the handle in place (same object identity the caller would get
    // from `page.$$`) rather than wrapping it.
    logos.push(Object.assign(handle, { box }));
  }

  return logos;
}
|
38 |
+
|
39 |
+
|
40 |
function process(o: { url: string, bco: string, name: string }): Promise<void> {
|
41 |
const promises: Promise<void>[] = [];
|
42 |
|
|
|
45 |
promises.push(new Promise<void>((accept, _reject) => {
|
46 |
page.once('load', async () => {
|
47 |
try {
|
48 |
+
const imgs = await get_logos(page, selectors.img_logo);
|
49 |
+
const ids = await get_logos(page, selectors.id_logo);
|
50 |
+
const cls = await get_logos(page, selectors.class_logo);
|
51 |
+
const logos = [
|
52 |
+
...imgs, ...ids, ...cls
|
53 |
+
]
|
54 |
+
|
55 |
let annotations = '';
|
56 |
for (const i in logos) {
|
57 |
+
const bb = logos[i].box
|
58 |
+
if (!bb
|
59 |
+
|| (bb.width < 10)
|
60 |
+
|| (bb.height < 10)
|
61 |
+
|| (bb.x + bb.width < 0)
|
62 |
+
|| (bb.y + bb.height < 0)) continue;
|
63 |
+
console.log('got bb', o.bco, bb)
|
64 |
|
65 |
try {
|
66 |
await logos[i].screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
|
src/selectors.ts
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
export default {
|
2 |
-
"
|
|
|
|
|
3 |
"logosbancos": "img[src*=logosbancos]",
|
4 |
"entity_http": "p.post-pagina-interior a[target=_blank][href*=http]",
|
5 |
"entity_mailto": "p.post-pagina-interior a[target=_blank][href*=mailto]"
|
|
|
1 |
export default {
|
2 |
+
"img_logo": "img[src*=logo]",
|
3 |
+
"id_logo": "*[id*=logo]",
|
4 |
+
"class_logo": "*[class*=logo]",
|
5 |
"logosbancos": "img[src*=logosbancos]",
|
6 |
"entity_http": "p.post-pagina-interior a[target=_blank][href*=http]",
|
7 |
"entity_mailto": "p.post-pagina-interior a[target=_blank][href*=mailto]"
|