feed.py - zs - Zeitungsschau rss to email converter

git clone git://r-36.net/zs (git://r-36.net)

---

feed.py (10854B)

---

#
# See LICENSE for licensing details.
#
# Copy me if you can.
# by 20h
#

import lxml
import lxml.objectify
import html
from datetime import datetime
import dateutil.parser
from dateutil.tz import gettz
import requests
import hashlib
import pytz
import codecs
import urllib.parse
import socket
import json

def parseiso(dstr, now):
    # Parse a date string; fall back to the caller's sentinel `now`
    # when the string cannot be parsed.
    def gettzinfo(zone, offset):
        try:
            return gettz(zone)
        except:
            return None

    try:
        return dateutil.parser.parse(str(dstr), default=now,
                tzinfos=gettzinfo)
    except:
        # Invalid time format. Could not be parsed.
        return now

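# Illustrative example (not part of the original file): because parseiso()
# returns the sentinel `now` on failure, callers can detect missing or
# broken dates by comparing the result against the value they passed in:
#
#   now = datetime.now(pytz.utc)
#   parseiso("2016-02-03T23:03:00+01:00", now)   # timezone-aware datetime
#   parseiso("not a date", now) == now           # True
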
def removenamespaces(xml):
    # Strip namespace prefixes from all tags so elements can be
    # addressed by their local names below.
    for key in xml.nsmap:
        nsstr = u'{%s}' % (xml.nsmap[key])
        nsl = len(nsstr)

        for elem in xml.iter():
            if elem.tag.startswith(nsstr):
                elem.tag = elem.tag[nsl:]

def parsexml(astr):
    xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8"))
    removenamespaces(xml)
    # Throw XML parsing errors so we can blame the feed authors.
    #print(lxml.objectify.dump(xml))
    return xml

def parsetwtxtfeed(astr, uri):
    # Parse a twtxt feed: one "timestamp<TAB>text" entry per line.
    feed = {}
    articles = []
    now = datetime.now(pytz.utc)
    # Sentinel timestamp; parseiso() hands it back for unparsable dates.
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    feed["title"] = uri
    feed["link"] = uri
    feed["updated"] = now

    lines = astr.split("\n")
    for line in lines:
        # People already reinterpret the standard. :(
        if len(line) == 0:
            continue
        if line[0] == "#":
            continue

        createdtxt, ltext = line.split("\t", 1)
        created = parseiso(createdtxt, now)

        article = {}
        article["id"] = createdtxt
        article["title"] = ltext
        article["text"] = ltext
        article["uuid"] = createdtxt
        article["updated"] = created

        if article["updated"] == now:
            article["uuid"] = ""
        else:
            article["uuid"] = "%s" % (article["updated"])

        articles.append(article)

    feed["articles"] = articles

    return feed

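# Illustrative twtxt input (not part of feed.py): every non-comment,
# non-empty line is an ISO-8601 timestamp, a tab, then the status text:
#
#   2016-02-03T23:05:00+01:00<TAB>Hello twtxt world.
#   # lines starting with "#" and empty lines are skipped
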
def parsejsonfeed(astr):
    # Parse a JSON Feed (jsonfeed.org) document.
    js = json.loads(astr)

    feed = {}
    articles = []
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    if "title" in js:
        feed["title"] = js["title"]
    if "description" in js:
        feed["description"] = js["description"]
    if "home_page_url" in js:
        feed["link"] = js["home_page_url"]
    if "feed_url" in js:
        feed["link"] = js["feed_url"]
    if "author" in js:
        if "name" in js["author"]:
            feed["author"] = js["author"]["name"]
    feed["updated"] = now

    if "items" in js:
        for item in js["items"]:
            article = {}
            if "url" in item:
                article["file"] = item["url"]
            if "title" in item:
                article["title"] = item["title"]

            if "summary" in item:
                article["text"] = html.unescape(item["summary"])
            if "content_html" in item:
                article["text"] = html.unescape(item["content_html"])
            if "content_text" in item:
                article["text"] = html.unescape(item["content_text"])

            # The id fallback below may use the text parsed above.
            if "id" in item:
                article["id"] = item["id"]
            else:
                if "link" in article:
                    article["id"] = article["link"]
                elif "file" in article:
                    article["id"] = article["file"]
                else:
                    article["id"] = article.get("text", "")[:30]

            if "date_published" in item:
                article["updated"] = \
                    dateutil.parser.parse(item["date_published"])
            else:
                article["updated"] = now

            if article["updated"] == now:
                article["uuid"] = ""
            else:
                article["uuid"] = "%s" % (article["updated"])

            for e in ("id", "title", "file"):
                if e in article:
                    article["uuid"] = "%s-%s" % \
                        (article["uuid"], article[e])

            def mkuuid(s):
                return hashlib.sha256(
                    str(s).encode("utf8")).hexdigest()
            if len(article["uuid"]) == 0:
                article["uuid"] = mkuuid(now)
            else:
                article["uuid"] = mkuuid(article["uuid"])

            # sanity checks
            if "title" not in article and "text" not in article \
                    and "file" not in article:
                continue

            articles.append(article)

    feed["articles"] = articles

    return feed

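# Illustrative only (not part of feed.py): a minimal JSON Feed document
# that parsejsonfeed() accepts looks like
#
#   {
#     "title": "Example feed",
#     "home_page_url": "https://example.org/",
#     "items": [
#       {"id": "1",
#        "url": "https://example.org/1",
#        "content_text": "hello",
#        "date_published": "2020-01-02T03:04:05Z"}
#     ]
#   }
#
# yielding one article whose uuid is a sha256 hash built from the
# publication date, the id and the url.
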
def parseatomfeed(astr):
    # Parse Atom, RSS 2.0 and RDF/RSS 1.0 feeds into the common
    # feed/article dictionary structure.
    xml = parsexml(astr)
    if xml is None:
        return None

    feed = {}
    articles = []
    isrss = False
    isrdf = False
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    if hasattr(xml, "channel"):
        if hasattr(xml, "item"):
            isrdf = True
            oxml = xml
        xml = xml.channel
        isrss = True

    feed["title"] = ""
    for e in ("title", "description"):
        if hasattr(xml, e):
            feed[e] = html.unescape(str(xml[e]))

    if hasattr(xml, "image") and hasattr(xml.image, "title"):
        if "title" not in feed:
            feed["title"] = html.unescape(str(xml.image.title))

    if hasattr(xml, "updated"):
        feed["updated"] = parseiso(xml.updated, now)
    elif hasattr(xml, "pubDate"):
        feed["updated"] = parseiso(xml.pubDate, now)
    elif hasattr(xml, "lastBuildDate"):
        feed["updated"] = parseiso(xml.lastBuildDate, now)
    else:
        feed["updated"] = now

    if hasattr(xml, "link"):
        if "href" in xml.link.attrib:
            feed["link"] = str(xml.link.attrib["href"])
        else:
            feed["link"] = str(xml.link)

    if hasattr(xml, "webmaster"):
        feed["email"] = html.unescape(str(xml.webmaster))
    elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
        feed["email"] = html.unescape(str(xml.owner.email))
    elif hasattr(xml, "author") and hasattr(xml.author, "email"):
        feed["email"] = html.unescape(str(xml.author.email))
    elif hasattr(xml, "webMaster"):
        feed["email"] = html.unescape(str(xml.webMaster))
    elif hasattr(xml, "managingeditor"):
        feed["email"] = html.unescape(str(xml.managingeditor))
    elif hasattr(xml, "managingEditor"):
        feed["email"] = html.unescape(str(xml.managingEditor))

    if hasattr(xml, "author"):
        if hasattr(xml.author, "name"):
            feed["author"] = html.unescape(str(xml.author.name))
        else:
            feed["author"] = html.unescape(str(xml.author))
    elif hasattr(xml, "creator"):
        feed["author"] = html.unescape(str(xml.creator))

    entryname = "entry"
    if isrss or isrdf:
        entryname = "item"
    if isrdf:
        xml = oxml
    if hasattr(xml, entryname):
        for entry in xml[entryname][:]:
            article = {}
            # title
            if hasattr(entry, "title"):
                article["title"] = html.unescape(str(entry["title"]))

            # link
            if hasattr(entry, "link"):
                if "href" in entry.link.attrib:
                    article["link"] = str(entry.link.attrib["href"])
                else:
                    article["link"] = str(entry.link)
            elif hasattr(entry, "source"):
                article["link"] = str(entry.source)

            # enclosure
            if hasattr(entry, "enclosure"):
                if "href" in entry.enclosure.attrib:
                    article["file"] = \
                        str(entry.enclosure.attrib["href"])
                elif "url" in entry.enclosure.attrib:
                    article["file"] = \
                        str(entry.enclosure.attrib["url"])
                else:
                    article["file"] = str(entry.enclosure)

            # media content (media:group)
            if hasattr(entry, "group") and \
                    hasattr(entry.group, "content"):
                if "url" in entry.group.content.attrib:
                    article["file"] = html.unescape(
                        str(entry.group.content.attrib["url"]))

            # updated
            try:
                if hasattr(entry, "updated"):
                    article["updated"] = parseiso(entry.updated, now)
                elif hasattr(entry, "temporary"):
                    article["updated"] = now
                elif hasattr(entry, "pubDate"):
                    article["updated"] = parseiso(entry.pubDate, now)
                elif hasattr(entry, "date"):
                    article["updated"] = parseiso(entry.date, now)
                else:
                    article["updated"] = now
            except TypeError:
                # There was some error in parseiso.
                article["updated"] = now

            # author
            if hasattr(entry, "author"):
                if hasattr(entry.author, "name"):
                    article["author"] = html.unescape(
                        str(entry.author.name))
                else:
                    article["author"] = html.unescape(
                        str(entry.author))
            elif hasattr(entry, "creator"):
                article["author"] = html.unescape(
                    str(entry.creator))

            # tags
            if hasattr(entry, "category"):
                article["tags"] = []
                for cat in entry["category"][:]:
                    article["tags"].append(
                        html.unescape(str(cat)))

            # text
            # Don't unescape the text, it might contain HTML.
            if hasattr(entry, "encoded"):
                article["text"] = str(entry.encoded)
            elif hasattr(entry, "content"):
                article["text"] = str(entry.content)
            elif hasattr(entry, "summary"):
                article["text"] = str(entry.summary)
            elif hasattr(entry, "description"):
                article["text"] = str(entry.description)

            # id
            if hasattr(entry, "id"):
                article["id"] = str(entry["id"])
            else:
                if "link" in article:
                    article["id"] = article["link"]
                elif "file" in article:
                    article["id"] = article["file"]
                else:
                    article["id"] = article["text"][:30]

            if article["updated"] == now:
                article["uuid"] = ""
            else:
                article["uuid"] = "%s" % (article["updated"])

            # Certain websites need exceptions due to their
            # »programmers« being stupid.
            if "link" in feed:
                if "youtube.com" in feed["link"]:
                    article["uuid"] = ""

            for e in ("id", "title", "file"):
                if e in article:
                    article["uuid"] = "%s-%s" % \
                        (article["uuid"], article[e])

            def mkuuid(s):
                return hashlib.sha256(
                    str(s).encode("utf8")).hexdigest()
            if len(article["uuid"]) == 0:
                article["uuid"] = mkuuid(now)
            else:
                article["uuid"] = mkuuid(article["uuid"])

            # sanity checks
            if "title" not in article and "text" not in article \
                    and "file" not in article:
                continue

            articles.append(article)

    try:
        feed["articles"] = sorted(articles,
                key=lambda article: article["updated"])
    except TypeError:
        # Mixed naive and aware datetimes cannot be compared; keep the
        # unsorted list rather than leaving feed["articles"] unset.
        for article in articles:
            print(article["updated"])
        feed["articles"] = articles

    return feed

def fetch(uri):
    # Fetch a feed over HTTP(S), gopher or from a local file and
    # dispatch to the matching parser. Returns (status code, feed).
    ftype = "xml"
    if "file://" in uri:
        fd = codecs.open(uri[7:], "r", "utf-8")
        fval = fd.read().encode("utf-8")
        fd.close()
        rcode = 200
    elif "gopher://" in uri:
        urls = urllib.parse.urlparse(uri, allow_fragments=False)
        if ":" in urls.netloc:
            (host, port) = urls.netloc.split(":")
            port = int(port)
        else:
            host = urls.netloc
            port = 70
        if len(urls.path) > 2:
            if len(urls.query) > 0:
                selector = "%s?%s" % (urls.path[2:], urls.query)
            else:
                selector = urls.path[2:]
        else:
            selector = ""

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect((host, port))
        s.send(("%s\r\n" % (selector)).encode("utf-8"))
        fd = s.makefile("r")
        fval = fd.read().encode("utf-8")
        s.close()
        rcode = 200
    else:
        fd = requests.get(uri, timeout=20,
                headers={"User-Agent": "Zeitungsschau/1.0"})
        fval = fd.content
        rcode = fd.status_code

        if "Content-Type" in fd.headers:
            if "application/json" in fd.headers["Content-Type"]:
                ftype = "json"

    if ftype == "xml":
        suri = uri.lower().rsplit(".", 1)
        if len(suri) > 1:
            if suri[-1] == "json":
                ftype = "json"
            elif suri[-1] == "txt":
                ftype = "twtxt"

    if ftype == "xml":
        rval = (rcode, parseatomfeed(fval))
    elif ftype == "twtxt":
        rval = (rcode, parsetwtxtfeed(fval.decode("utf-8"), uri))
    else:
        rval = (rcode, parsejsonfeed(fval.decode("utf-8")))

    if rval[1] is not None:
        rval[1]["feeduri"] = uri

    return rval
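
---

Usage sketch (illustrative, not from the repository): assuming this module is importable as "feed" and the URI below points to a reachable Atom/RSS feed, fetch() returns a (status code, feed dict) pair:

    import feed

    rcode, f = feed.fetch("https://example.org/atom.xml")
    if rcode == 200 and f is not None:
        print(f.get("title", ""), f["updated"])
        for article in f["articles"]:
            print(article["uuid"], article.get("title", ""))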