|
|
eomyidae - eomyidae - a gopher crawler software |
|
|
 |
git clone git://bitreich.org/eomyidae (git://bitreich.org) |
|
|
 |
Log |
|
|
 |
Files |
|
|
 |
Refs |
|
|
 |
Tags |
|
|
 |
README |
|
|
 |
LICENSE |
|
|
|
--- |
|
|
|
eomyidae (15243B) |
|
|
|
--- |
|
|
|
1 #!/usr/bin/env python |
|
|
|
2 # coding=utf-8 |
|
|
|
3 # |
|
|
|
4 # See the LICENSE file for details. |
|
|
|
5 # |
|
|
|
6 |
|
|
|
7 import os |
|
|
|
8 import sys |
|
|
|
9 import getopt |
|
|
|
10 import urllib.parse |
|
|
|
11 import socket |
|
|
|
12 import io |
|
|
|
13 import pickle |
|
|
|
14 import time |
|
|
|
15 import hashlib |
|
|
|
16 import errno |
|
|
|
17 import random |
|
|
|
18 import operator |
|
|
|
19 import math |
|
|
|
20 from multiprocessing import Pool |
|
|
|
21 from datetime import datetime |
|
|
|
22 from datetime import timedelta |
|
|
|
23 |
|
|
|
def parseuri(uri):
    """Split a gopher URI into ``(host, port, mtype, selector)``.

    The port is the string found in the URI when one is given and the
    integer 70 otherwise, so callers normalise it with ``int()``.  The item
    type is the character following the leading slash of the path (default
    ``"1"``); the selector is the rest of the path, with a query string
    re-attached when the URI carries one.
    """
    parts = urllib.parse.urlparse(uri, allow_fragments=False)
    netloc = parts.netloc
    path = parts.path

    if ":" in netloc:
        host, port = netloc.split(":")[:2]
    else:
        host, port = netloc, 70

    mtype = path[1] if len(path) > 1 else "1"

    selector = path[2:] if len(path) > 2 else ""
    if selector and parts.query:
        selector = "%s?%s" % (selector, parts.query)

    return (host, port, mtype, selector)
|
|
|
45 |
|
|
|
def poolgopher(req):
    """Worker entry point: fetch one job and attach the fetched data to it.

    ``req`` is a list ``[uri, host, port, selector]``; the text returned by
    gopher() is appended as a fifth element and the same list is returned.
    """
    uri = req[0]
    host = req[1]
    port = req[2]
    selector = req[3]
    req.append(gopher(uri, host, port, selector))
    return req
|
|
|
50 |
|
|
|
def gopher(uri=None, host=None, port=70, selector=""):
    """Fetch one gopher resource and return its content as text.

    When ``uri`` is given it is split with parseuri() and overrides
    ``host``, ``port`` and ``selector``; otherwise those arguments are used
    directly.  The selector is sent followed by CRLF and the reply is read
    until the server closes the connection.

    Returns the reply decoded as UTF-8 (undecodable bytes are replaced), or
    ``""`` when connecting, sending or reading fails.
    """
    if uri is not None:
        (host, port, mtype, selector) = parseuri(uri)
    # parseuri() hands back the port as a string when the URI names one.
    port = int(port)

    # The context managers guarantee that the socket and its file wrapper
    # are closed on every path, including the early error returns.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(20)
        try:
            s.connect((host, port))
            # sendall() retries until the whole selector has been written.
            s.sendall(("%s\r\n" % (selector)).encode("utf-8"))
            with s.makefile("rb") as fd:
                data = fd.read()
        except (OSError, OverflowError, UnicodeError):
            # OSError covers resolver errors, timeouts, refused or reset
            # connections, broken pipes and unreachable hosts.
            # OverflowError is raised for ports outside 0-65535 and
            # UnicodeError for host names that cannot be IDNA-encoded.
            return ""

    # errors="replace" never raises, so no fallback encoding is needed.
    return data.decode(errors="replace")
|
|
|
99 |
|
|
|
def parsemenu(data):
    """Parse gopher menu text into a list of entries.

    Each entry is ``[mtype, description, selector, host, port]`` with every
    field kept as a string.  Blank lines and lines with fewer than four
    tab-separated fields are skipped; parsing stops at the first line that
    starts with ``"."``.
    """
    entries = []
    for rawline in data.split("\n"):
        row = rawline.strip()
        if not row:
            continue

        itemtype = row[0]
        # A leading dot marks the end of the menu.
        if itemtype == ".":
            break

        fields = row[1:].split("\t")
        if len(fields) >= 4:
            entries.append([itemtype] + fields[:4])

    return entries
|
|
|
121 |
|
|
|
def menu2text(menu):
    """Join the description column of a parsed menu into one text.

    Every entry whose second field is a plain ``str`` contributes that
    field followed by a newline; all other entries are ignored.
    """
    return "".join(
        "%s\n" % (entry[1]) for entry in menu if type(entry[1]) == str
    )
|
|
|
131 |
|
|
|
132 ## Robots.txt |
|
|
|
133 # https://en.wikipedia.org/wiki/Robots.txt |
|
|
|
134 # # Comment |
|
|
|
135 # User-agent: somebot |
|
|
|
136 # Disallow: /path |
|
|
|
137 # Allow: /path |
|
|
|
138 # Crawl-delay: seconds |
|
|
|
def parserobots(data):
    """Parse robots.txt text into a list of ``[header, value]`` pairs.

    Headers are lower-cased.  Values keep their case, because rule paths
    are matched case-sensitively against selectors; only ``user-agent``
    values are lower-cased so that bot names compare case-insensitively.
    A blank line is reported as ``["", ""]`` so the caller can tell where
    one group of rules ends.  Comments (``#`` to end of line) are dropped,
    and lines without a ``:`` are ignored.
    """
    robots = []
    for rawline in data.split("\n"):
        line = rawline.strip()
        if len(line) == 0:
            # Empty line, needed for bot-specific rules.
            robots.append(["", ""])
            continue

        # Strip the comment only after the blank-line test, so that a
        # comment-only line does not end the current group.
        line = line.split("#", 1)[0]
        if ":" not in line:
            continue

        (header, value) = line.split(":", 1)
        header = header.strip().lower()
        value = value.strip()
        if header == "user-agent":
            value = value.lower()
        robots.append([header, value])
    return robots
|
|
|
158 |
|
|
|
def adaptrobots(robotsdata):
    """Reduce robots.txt text to the rules that apply to this crawler.

    Rules are collected while the most recent ``user-agent`` line names
    ``eomyidae`` (text before an optional ``/``) or ``*``.  An entry with
    an empty header switches collection off again.

    Returns a dict with the keys ``"allow"`` and ``"disallow"`` (lists of
    patterns), ``"other"`` (list of ``[header, value]`` pairs) and
    ``"empty"`` (True when all three lists are empty).
    """
    allowlines = []
    disallowlines = []
    otherlines = []
    applies = False

    for entry in parserobots(robotsdata):
        header = entry[0].lower()
        value = entry[1]

        if header == "user-agent":
            applies = value.split("/")[0] in ("eomyidae", "*")
        elif header == "":
            applies = False
        elif not applies:
            # Rule belongs to some other bot.
            continue
        elif header == "allow":
            allowlines.append(value)
        elif header == "disallow":
            disallowlines.append(value)
        else:
            otherlines.append([header, value])

    filterlines = {}
    filterlines["allow"] = allowlines
    filterlines["disallow"] = disallowlines
    filterlines["other"] = otherlines
    filterlines["empty"] = not (allowlines or disallowlines or otherlines)

    return filterlines
|
|
|
198 |
|
|
|
def mkpath(cachepath):
    """Create ``cachepath`` and any missing parents.

    An already existing path is not an error; every other failure of
    os.makedirs() is propagated.
    """
    try:
        os.makedirs(cachepath)
    except FileExistsError:
        # Same condition as errno.EEXIST: the path is already there.
        pass
|
|
|
205 |
|
|
|
def mkopen(cachefile):
    """Open ``cachefile`` for binary writing and return the file object.

    The file is created when missing and truncated when present.  Mode
    ``"wb"`` does both in one step, which avoids the window between an
    existence check and an exclusive create in which another process could
    create the file and make the open fail.
    """
    return open(cachefile, "wb")
|
|
|
212 |
|
|
|
def informserveradmin(uri, host=None, port=70):
    """Send a greeting selector to the server so its admin can contact us.

    When ``host`` is not given, host and port are taken from ``uri``.  The
    server's reply is discarded.
    """
    if host is None:
        host, port = parseuri(uri)[:2]
        port = int(port)

    # We are nice and inform before every robots.txt, how to contact us.
    greeting = ("This is eomyidae, your "
                "friendly crawler. See "
                "gopher://gopherproject.org/1/eomyidae for "
                "more info. Have a nice day!")
    gopher(host=host, port=port, selector=greeting)
|
|
|
223 |
|
|
|
def cacherobots(cachedir, uri, host=None, port=70, force=False,
        filtercache=None):
    """Return the robots.txt filter rules for a server, using caches.

    Lookup order: the in-memory ``filtercache`` dict (when given), then the
    pickled rules below ``cachedir``, then the network.  The server is only
    contacted when no robots.txt has been cached on disk yet or ``force``
    is set; in that case the raw robots.txt is stored and, when it holds
    any rules for us, the adapted rules are pickled next to it.

    ``host``/``port`` default to the values parsed from ``uri``.  Returns
    the dict produced by adaptrobots(), or ``{"empty": True}`` when there
    are no usable rules.
    """
    if host is None:
        (host, port, mtype, selector) = parseuri(uri)
        port = int(port)

    # Key the memory cache by host AND port, like the on-disk cache, so two
    # servers on one host do not share each other's rules.
    memkey = "%s:%s" % (host, port)
    if filtercache is not None and memkey in filtercache:
        #print("Got filterlines from memory filtercache.")
        return filtercache[memkey]

    print("Getting robots for %s:%d" % (host, port))

    # NOTE(review): host ends up in a filesystem path; when it comes from a
    # remote menu it may contain "/" or ".." -- confirm whether callers
    # validate it before it reaches this point.
    cachepath = "%s/%s:%d" % (cachedir, host, port)
    mkpath(cachepath)

    cacherobotstxt = "%s/robots.txt" % (cachepath)
    cacherobotspickle = "%s/robots.pickle" % (cachepath)
    filterlines = {"empty": True}
    if force or not os.path.exists(cacherobotstxt):
        # Be nice.
        informserveradmin(uri=uri, host=host, port=port)

        robotsdata = gopher(host=host, port=port, selector="/robots.txt")
        print("Got new robots.txt.")
        print(robotsdata)
        with mkopen(cacherobotstxt) as robotstxtfd:
            robotstxtfd.write(robotsdata.encode())

        filterlines = adaptrobots(robotsdata)
        # Do not store if there is nothing, so we save I/O later.
        if not filterlines["empty"]:
            print("Storing filterlines.")
            storelistdb(cacherobotspickle, filterlines)
    elif os.path.exists(cacherobotspickle):
        #print("Loading filterlines from cache.")
        loaded = loadlistdb(cacherobotspickle)
        # loadlistdb() returns [] for a truncated pickle; treat anything
        # that is not a rules dict as "no rules" instead of failing later.
        if isinstance(loaded, dict) and "empty" in loaded:
            filterlines = loaded

    #print(filterlines)
    if filtercache is not None:
        filtercache[memkey] = filterlines

    return filterlines
|
|
|
272 |
|
|
|
def selectorisallowed(filterlines, selector):
    """Tell whether ``selector`` may be crawled under ``filterlines``.

    ``filterlines`` is a rules dict with the keys ``"empty"``,
    ``"disallow"`` and ``"allow"``.  A selector is refused when any
    disallow pattern matches it, unless an allow pattern matches as well.
    Empty patterns are ignored.
    """
    if filterlines["empty"]:
        return True

    def robotsmatch(pattern, selector):
        # "*" matches everything; a leading and/or trailing "*" turns the
        # rest of the pattern into a suffix, substring or prefix test.
        # Without any star the pattern is a plain prefix.
        if pattern == '*':
            return True

        leading = pattern[0] == '*'
        trailing = pattern[-1] == '*'
        if leading and trailing:
            return pattern[1:-1] in selector
        if leading:
            return selector.endswith(pattern[1:])
        if trailing:
            return selector.startswith(pattern[:-1])
        return selector.startswith(pattern)

    # TODO: Should an empty pattern match everything?
    denied = any(robotsmatch(pattern, selector)
                 for pattern in filterlines["disallow"]
                 if len(pattern) > 0)
    permitted = any(robotsmatch(pattern, selector)
                    for pattern in filterlines["allow"]
                    if len(pattern) > 0)

    # An allow rule wins over any disallow rule.
    return permitted or not denied
|
|
|
317 |
|
|
|
def loadselectorstxt(filename):
    """Read a ``|``-separated text file into a list of field lists.

    Lines are split as they are, so the last field of each row keeps its
    line ending.  A missing file yields an empty list.
    """
    if not os.path.exists(filename):
        return []

    with open(filename, "r") as fd:
        return [row.split("|") for row in fd]
|
|
|
329 |
|
|
|
def loadlist(filename):
    """Read a plain list file, one element per line.

    Surrounding whitespace is removed from every line; blank lines and
    lines starting with ``#`` are dropped.  A missing file yields an empty
    list.
    """
    if not os.path.exists(filename):
        return []

    with open(filename, "r") as fd:
        stripped = (row.strip() for row in fd)
        return [row for row in stripped if len(row) > 0 and row[0] != "#"]
|
|
|
345 |
|
|
|
def loadlistdb(filename):
    """Load a pickled object from ``filename``.

    Returns the unpickled object, or ``[]`` when the file does not exist or
    is empty/truncated (EOFError).  The file is closed on every path.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files this program wrote itself with storelistdb().
    """
    if not os.path.exists(filename):
        return []

    with open(filename, "rb") as fd:
        try:
            return pickle.load(fd)
        except EOFError:
            return []
|
|
|
358 |
|
|
|
def storelistdb(filename, listelems):
    """Pickle ``listelems`` into ``filename``, replacing earlier content."""
    with mkopen(filename) as fd:
        pickle.dump(listelems, fd)
|
|
|
363 |
|
|
|
def storerawdata(cachedir, uri, data, host=None, port=70):
    """Store the raw text fetched for ``uri`` in the per-server cache.

    The file lives in ``<cachedir>/<host>:<port>/`` and is named after the
    SHA-256 hex digest of the URI with a ``.menu`` suffix.  Its first line
    is the URI itself, followed by ``data``.  When ``host`` is not given,
    host and port are taken from ``uri``.
    """
    if host is None:
        host, port = parseuri(uri)[:2]
        port = int(port)

    hostdir = "%s/%s:%s" % (cachedir, host, port)
    mkpath(hostdir)

    urihash = hashlib.sha256(uri.encode()).hexdigest()
    menufile = "%s/%s.menu" % (hostdir, urihash)

    #print("Storing %s at %s" % (uri, menufile))
    with mkopen(menufile) as fd:
        fd.write(("%s\n" % (uri)).encode())
        fd.write(data.encode())
|
|
|
382 |
|
|
|
def usage(app):
    """Print the command line synopsis to stderr and exit with status 1.

    ``app`` is the program path; only its base name is shown.
    """
    progname = os.path.basename(app)
    synopsis = "usage: %s [-hor] [-b base] [-f blocklist] [-w n] [starturl]" % (progname)
    print(synopsis, file=sys.stderr)
    sys.exit(1)
|
|
|
387 |
|
|
|
def main(args):
    """Run the crawler; ``args`` is the full argument vector (sys.argv).

    Options:
      -h        show usage and exit
      -b base   working directory holding the state pickles and the cache
      -f file   blocklist of URI prefixes, reloaded whenever state is saved
      -o        rebuild the per-host counters from the stored queues
      -r        do not keep robots.txt rules cached in memory
      -w n      number of worker processes (values below 1 become 1)

    An optional start URI seeds the queue.  State is kept in
    knownuris.pickle, hostscache.pickle and hostscount.pickle inside the
    base directory.  Returns 0 once the queue is empty.
    """
    try:
        opts, largs = getopt.getopt(args[1:], "hb:f:ow:r")
    except getopt.GetoptError as err:
        print(str(err))
        usage(args[0])

    blocklistfile = None
    blocklist = []

    base = "."
    starturi = None
    workernum = 1
    robotscache = {}
    forcehostscount = False
    for o, a in opts:
        if o == "-h":
            usage(args[0])
        elif o == "-b":
            base = a
        elif o == "-f":
            blocklistfile = a
            blocklist = loadlist(blocklistfile)
            print("blocklist: %s" % (blocklist))
        elif o == "-o":
            forcehostscount = True
        elif o == "-r":
            # Do not cache robots.txt in memory.
            robotscache = None
        elif o == "-w":
            try:
                workernum = int(a)
            except ValueError:
                workernum = 1
            # Pool() rejects a process count below one.
            if workernum < 1:
                workernum = 1
        else:
            assert False, "unhandled option"

    # Resolve base before changing into it; a relative base would otherwise
    # make cachedir point at <base>/<base>/cache after the chdir.
    base = os.path.abspath(base)
    os.chdir(base)
    cachedir = "%s/cache" % (base)

    if len(largs) > 0:
        starturi = largs[0]

    # loadlistdb() returns [] when there is no stored state yet.
    knownuris = loadlistdb("knownuris.pickle")
    if knownuris == []:
        knownuris = {}
    lastlenknownuris = len(knownuris)

    def isblocked(uri):
        # A URI is blocked when it starts with any blocklist entry.
        for rule in blocklist:
            if uri.startswith(rule):
                return True
        return False

    def addhostscount(host):
        if host in hostscount:
            hostscount[host] += 1
        else:
            hostscount[host] = 1

    def subhostscount(host):
        if host in hostscount:
            hostscount[host] -= 1
            if hostscount[host] <= 0:
                del hostscount[host]

    def addhostscache(uri, host=None, port=70, selector="/"):
        # Queue uri for its host unless it is already known, has no host,
        # is blocklisted or is refused by the server's robots.txt.
        if uri != None and host == None:
            (host, port, mtype, selector) = parseuri(uri)
            port = int(port)
        else:
            try:
                port = int(port)
            except ValueError:
                return

        if uri in knownuris:
            print("ignored for queue: %s" % (uri))
            return
        if host == "":
            print("ignored for queue: %s" % (uri))
            return
        if isblocked(uri):
            print("blocked by filters: %s" % (uri))
            return

        addhostscount(host)

        if not host in hostscache:
            hostscache[host] = {}
        if not "queue" in hostscache[host]:
            hostscache[host]["queue"] = {}

        filterrules = cacherobots(cachedir, uri,
                host=host,
                port=port,
                filtercache=robotscache)
        if selectorisallowed(filterrules, selector) == True:
            hostscache[host]["queue"][uri] = None
            print("pushed to queue: %s" % (uri))
        else:
            print("blocked by robots: %s" % (uri))

    def getqueuelen():
        queuelen = 0
        for host in hostscache:
            queuelen += len(hostscache[host]["queue"])
        return queuelen

    hostscache = loadlistdb("hostscache.pickle")
    if hostscache == []:
        hostscache = {}
    hostscount = loadlistdb("hostscount.pickle")
    if hostscount == [] or forcehostscount == True:
        # Rebuild the counters from the stored queues.
        hostscount = {}
        for host in list(hostscache.keys()):
            print("host = %s, queuelen = %d"
                    % (host,
                        len(hostscache[host]["queue"])))
            if len(hostscache[host]["queue"]) == 0:
                del hostscache[host]
                continue
            for uri in hostscache[host]["queue"]:
                (urihost, port, mtype, selector) = parseuri(uri)
                addhostscount(urihost)

    def storestate():
        # Without nonlocal the assignment below would create a local name
        # and isblocked() would never see the reloaded blocklist.
        nonlocal blocklist
        if blocklistfile != None:
            blocklist = loadlist(blocklistfile)
            if len(blocklist) > 0:
                print("blocklist: %s" % (blocklist))
        print("################## Storing state to disc.")
        storelistdb("knownuris.pickle", knownuris)
        storelistdb("hostscache.pickle", hostscache)
        storelistdb("hostscount.pickle", hostscount)
        print("################## Storing state to disc done.")

    jobs = []
    if starturi != None:
        #print("starturi = %s" % (starturi))
        if not isblocked(starturi):
            (starthost, startport, startmtype, startselector) = parseuri(starturi)
            addhostscache(starturi,
                    selector=startselector,
                    host=starthost,
                    port=startport)
            try:
                jobs.append([starturi, starthost, int(startport), startselector])
            except ValueError:
                # Please fix your URI.
                pass

    # Store state keeper.
    startnow = datetime.now()
    storedelta = timedelta(seconds=10) # save state every 10 seconds

    lastlenknownhosts = len(hostscache)
    lastlenuriqueue = getqueuelen()
    while lastlenuriqueue > 0:
        if len(jobs) < workernum:
            # Forget hosts whose queue has run dry.
            for host in list(hostscache.keys()):
                if len(hostscache[host]["queue"]) == 0:
                    del hostscache[host]
                    if host in hostscount:
                        del hostscount[host]

            # Pick the hosts with the smallest counters first.
            selhosts = sorted(hostscount.items(),
                    key=operator.itemgetter(1))[:workernum*2]

            # Give hosts with many selectors more jobs.
            hostjobs = {}
            for selhost in selhosts:
                # 10 ** x
                hostjobs[selhost[0]] = \
                    math.floor(math.log10(selhost[1]))
                if hostjobs[selhost[0]] == 0:
                    hostjobs[selhost[0]] = 1
            print("Queue Status: %s" % (hostjobs))

            for selhost in selhosts:
                selhost = selhost[0]
                if selhost not in hostscache:
                    # Counter without a queue (stale state); drop it
                    # instead of failing with a KeyError.
                    del hostscount[selhost]
                    continue
                seluris = hostscache[selhost]["queue"]
                while hostjobs[selhost] > 0:
                    if len(seluris) == 0:
                        break
                    jobitem = seluris.popitem()
                    if isblocked(jobitem[0]):
                        continue
                    (host, port, mtype, selector) = parseuri(jobitem[0])
                    job = [jobitem[0], host, port, selector]
                    if job not in jobs:
                        jobs.append(job)
                    hostjobs[selhost] -= 1

        print("Getting %d jobs." % (len(jobs)))

        # Fetch all selected jobs in parallel worker processes.
        dataresults = []
        with Pool(processes=workernum) as pool:
            dataresults = pool.map(poolgopher, jobs)
        jobs = []

        for dataresult in dataresults:
            (cururi, host, port, selector, data) = dataresult
            subhostscount(host)
            storerawdata(cachedir, cururi, data, host=host, port=port)
            menudata = parsemenu(data)
            #print(menudata)
            for mi in menudata:
                # Only menus so far.
                if mi[0] == "1":
                    # Fix menu items with ports in hosts.
                    if ":" in mi[3]:
                        mi[3] = mi[3].split(":")[0]

                    guri = "gopher://%s:%s/%s%s" % \
                        (mi[3], mi[4], mi[0], mi[2])

                    addhostscache(guri, host=mi[3],
                            port=mi[4],
                            selector=mi[2])

            print("Uri %s done." % (cururi))
            knownuris[cururi] = None

        # Progress report; the values in parentheses are the change since
        # the previous round.
        lenuriqueue = getqueuelen()
        lenknownuris = len(knownuris)
        lenknownhosts = len(hostscache)
        print("> queue hosts = %d (%d) %s" % \
            (lenknownhosts, lenknownhosts -
                lastlenknownhosts, hostscache.keys()))
        print("> uri queue len = %d (%d)" % \
            (lenuriqueue, lenuriqueue - lastlenuriqueue))
        print("> visited uris = %d (%d)" % \
            (lenknownuris, lenknownuris - lastlenknownuris))
        lastlenknownuris = lenknownuris
        lastlenuriqueue = lenuriqueue
        lastlenknownhosts = lenknownhosts

        # TODO: Remove after debugging
        nowdelta = datetime.now() - startnow
        if nowdelta >= storedelta:
            storestate()
            startnow = datetime.now()

        time.sleep(0.2) # don't be too harsh on servers

        #break #oneshot

    # Save at end of even single shot.
    storestate()

    return 0
|
|
|
642 |
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the exit status.
    exitstatus = main(sys.argv)
    sys.exit(exitstatus)
|
|
|
645 |
|