|
|
selenium_crawl_tsv.py - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository |
|
|
 |
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons (git://bitreich.org) |
|
|
 |
Log |
|
|
 |
Files |
|
|
 |
Refs |
|
|
 |
Tags |
|
|
|
--- |
|
|
|
selenium_crawl_tsv.py (3006B) |
|
|
|
--- |
|
|
|
1 from selenium import webdriver |
|
|
|
2 from selenium.webdriver.common.by import By |
|
|
|
3 |
|
|
|
4 from selenium.webdriver.firefox.options import Options |
|
|
|
5 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile |
|
|
|
6 |
|
|
|
7 import sys |
|
|
|
8 from datetime import datetime |
|
|
|
9 |
|
|
|
def make_escape_content_trans() -> dict:
    """Build the ``str.translate`` table used for the content field.

    The table deletes the ASCII control characters 0x00-0x1f and DEL (0x7f),
    except that newline and tab are kept as the two-character sequences
    ``\\n`` and ``\\t``.  A literal backslash is doubled so those sequences
    stay unambiguous.

    Returns:
        A mapping from code point to replacement string, suitable for
        ``str.translate``.
    """
    # Start by deleting every C0 control character and DEL.
    table = {code: "" for code in range(32)}
    table[0x7F] = ""  # DEL

    # Then override the characters that must be escaped rather than dropped.
    # Integer ordinals are used throughout so each code point has exactly one
    # entry, instead of relying on a later "\n"/"\t" string key overriding
    # the earlier integer key for the same character.
    table[ord("\\")] = "\\\\"
    table[ord("\n")] = "\\n"
    table[ord("\t")] = "\\t"

    return str.maketrans(table)
|
|
|
21 |
|
|
|
def make_escape_field_trans() -> dict:
    """Build the ``str.translate`` table used for single-line fields.

    The table deletes the ASCII control characters 0x00-0x1f and DEL (0x7f),
    except that newline and tab are each replaced by a single space so the
    value cannot break out of its tab-separated column.

    Returns:
        A mapping from code point to replacement string, suitable for
        ``str.translate``.
    """
    # Start by deleting every C0 control character and DEL.
    table = {code: "" for code in range(32)}
    table[0x7F] = ""  # DEL

    # Newline and tab become a space instead of being dropped.  Integer
    # ordinals are used so each code point has exactly one entry, instead of
    # relying on a later string key overriding the earlier integer key.
    table[ord("\n")] = " "
    table[ord("\t")] = " "

    return str.maketrans(table)
|
|
|
32 |
|
|
|
# Translation tables are built once at module load and reused for every
# record that gets printed.
escape_content_tbl = make_escape_content_trans()
escape_field_tbl = make_escape_field_trans()
|
|
|
35 |
|
|
|
def escape_content(s: str) -> str:
    """Escape *s* for use as the content column of a TSV record.

    The string is passed through ``escape_content_tbl`` and then stripped of
    leading and trailing whitespace.  Stripping happens after translation.

    Args:
        s: Raw content text (the caller passes an element's outer HTML).

    Returns:
        The translated, stripped string.
    """
    return s.translate(escape_content_tbl).strip()
|
|
|
38 |
|
|
|
def escape_field(s: str) -> str:
    """Escape *s* for use as a single-line column of a TSV record.

    The string is passed through ``escape_field_tbl`` and then stripped of
    leading and trailing whitespace.

    Args:
        s: Raw field text (the caller passes page titles and link URLs).

    Returns:
        The translated, stripped string.
    """
    return s.translate(escape_field_tbl).strip()
|
|
|
41 |
|
|
|
# Command line: exactly one argument is used, the URL of the index page.
if len(sys.argv) > 1:
    url = sys.argv[1]
else:
    # Diagnostics go to stderr; stdout is reserved for the TSV records.
    print("usage: <url>", file=sys.stderr)
    sys.exit(1)

options = Options()
options.add_argument("--headless")

# use existing profile:
# NOTE: must not be running at the same time.
#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
#options.add_argument("--profile")
#options.add_argument(profile_path)
#options.set_preference("profile", profile_path)

# setup custom profile:
# JS disabled
options.set_preference("javascript.enabled", False)
# disable stylesheet
options.set_preference("permissions.default.stylesheet", 2)
# disable image loading
options.set_preference("permissions.default.image", 2)
# override user-agent.
#options.set_preference("general.useragent.override", "whatever you want")

driver = webdriver.Firefox(options=options)
try:
    # set timeouts
    #driver.implicitly_wait(10)

    # get the page
    driver.get(url)

    # Debugging aids: dump the title or the page markup.
    #print(driver.title)
    #pagesource = driver.execute_script("return document.body.InnerHTML;")
    #print(pagesource)
    #print(driver.page_source)
    #outer_html = driver.find_element(By.XPATH, "//body").get_attribute("outerHTML")
    #outer_html = driver.find_element(By.TAG_NAME, "html").get_attribute("outerHTML")
    #print(outer_html)

    # Collect the link targets up front: the loop below navigates away from
    # this page, so the anchor elements are not touched again afterwards.
    #links = driver.find_elements(By.TAG_NAME, "a")
    anchors = driver.find_elements(By.CSS_SELECTOR, "main a")
    links = []
    for anchor in anchors:
        href = anchor.get_attribute("href")
        text = anchor.text
        # get_attribute() returns None for an anchor without href; a plain
        # truthiness test skips both None and the empty string.
        if href:
            links.append({"href": href, "text": text})

    # Visit every linked page and print one TSV record per page:
    # timestamp, title, link, content, content type.
    for link in links:
        driver.get(link["href"])

        # parse timestamp.
        # NOTE(review): the parsed date is naive, so timestamp() interprets
        # it in the local time zone of the machine running the crawl.
        time_element = driver.find_element(By.TAG_NAME, "time")
        ts = datetime.strptime(time_element.text, "%Y-%m-%d")
        ts = int(ts.timestamp())

        content = driver.find_element(By.CSS_SELECTOR, "article").get_attribute("outerHTML")
        title = driver.title
        title = title.replace(" - Codemadness", "")

        # escape fields
        content = escape_content(content)
        title = escape_field(title)
        link_field = escape_field(link["href"])

        print("%d\t%s\t%s\t%s\thtml" % (ts, title, link_field, content))
finally:
    # Always shut the browser down, even when a page fails to load or an
    # expected element is missing; quit() closes every window and ends the
    # session, so a separate close() is not needed.
    driver.quit()
|