forked from kovidgoyal/calibre
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathal_monitor.recipe
More file actions
213 lines (182 loc) · 6.85 KB
/
Copy pathal_monitor.recipe
File metadata and controls
213 lines (182 loc) · 6.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class AlMonitor(BasicNewsRecipe):
    '''
    Calibre recipe that spiders the Al-Monitor "pulse" section pages.

    Each page listed in ``sections`` is fetched, every link whose href
    contains ``pulse/originals`` is queued as an article (de-duplicated
    across sections and capped per section), and articles whose scraped
    "Posted <date>" stamp is too old are dropped while they are downloaded.
    '''

    title = u'Al Monitor'
    __author__ = u'spswerling'
    description = 'The Pulse of the Middle East'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True
    recursions = 0

    # on kindle, images can make things kind of fat. Slim them down.
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    # NOTE(review): not referenced anywhere in this class - confirm whether
    # anything outside this file reads it.
    useHighResImages = False

    # Compared by should_skip_article() against the article age in whole
    # days (see the note there about the effective cut-off).
    oldest_article = 1.5
    # Upper bound on queued articles per section (queue_article_link()).
    max_articles_per_section = 15
    # Link titles longer than this are truncated and suffixed with '...'.
    max_title_length = 120

    # (section key, index page url); the key is title-cased for display.
    sections = [
        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
        (u'palestine', u'http://www.al-monitor.com/pulse/palestine-pulse'),
        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, rgx_str):
        '''
        Return a tag-spec dict matching element ``elt`` whose attribute
        ``attr`` matches ``rgx_str`` (case-insensitive).

        Deliberately takes no ``self``: it is only called while the class
        body is being executed, to build ``remove_tags`` below.
        '''
        return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)})

    remove_tags = [
        dict(attrs={'id': [
            'header',
            'pulsebanner',
            'relatedarticles',
            'sidecolumn',
            'disqus',
            'footer',
            'footer2',
            'footer3',
            'mobile-extras',
        ]}),
        tag_matcher('hr', 'id', 'spacer'),
        tag_matcher('a', 'title', 'print this article'),
        tag_matcher('div', 'class', 'extras'),
        tag_matcher('div', 'class', '^clear$'),
        tag_matcher('div', 'class', '^overlay$'),
        tag_matcher('div', 'class', 'shareTag'),
    ]

    # Matches the leading "Posted <Month> <day>, <year>" stamp of a span.
    # Compiled once here instead of once per span scanned.
    _posted_rgx = re.compile(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d)')
    # strptime format for the text matched by _posted_rgx.
    _posted_format = 'Posted %B %d, %Y'

    # Class-level defaults only; parse_index() rebinds fresh containers on
    # the instance so nothing accumulates on the class between runs.
    articles = {}   # section key -> list of article dicts
    urls_done = []  # absolute urls already queued, in any section

    def parse_index(self):
        '''
        Spider every section and return the feed index.

        Returns a list of ``(title, articles)`` tuples in the order the
        sections are declared in ``sections``.
        '''
        self.articles = {}
        self.urls_done = []
        for name, url in self.sections:
            self.parse_section(name, url)
        # Walk ``sections`` rather than the dict so feed order is the
        # declared order regardless of dict iteration order.
        return [(string.capwords(name), self.articles.get(name, []))
                for name, _ in self.sections]

    def parse_section(self, section, url):
        '''
        Fetch one section index page and queue its article links.

        A page that cannot be fetched leaves the section empty and returns
        ``[]``; otherwise nothing is returned.
        '''
        self.articles[section] = []
        self._p('process section ' + section + ', url: ' + url)
        try:
            soup = self.index_to_soup(url)
        except Exception as err:
            # Best effort: one unreachable section must not stop the rest.
            self._p('Unable to spider section: ' + repr(err))
            return []
        self._p('Got section. Processing links.')
        for link in soup.findAll('a'):
            href = link.get('href')
            # Anchors without an href (href is None) are not article links.
            if not href or 'pulse/originals' not in href:
                continue
            if self.text(link):
                self.process_link(section, link)

    def process_link(self, section, link):
        '''
        Queue the article behind ``link`` under ``section``, using the
        (possibly truncated) link text as the article title.
        '''
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        title = self.text(link)
        if len(title) > self.max_title_length:
            title = title[0:self.max_title_length] + '...'
        self.queue_article_link(section, href, title)

    def queue_article_link(self, section, url, title):
        '''
        Append an article entry for ``url`` to ``section``.

        Nothing is queued when the url was already queued (in any section)
        or when the section already holds ``max_articles_per_section``
        entries. Propagates ValueError from abs_url().
        '''
        full_url = self.abs_url(url)
        label = ' - '.join([section, title, url])
        if full_url in self.urls_done:
            self._p('Skip (already Qd): ' + label)
            return
        queued = self.articles.setdefault(section, [])
        # Check the cap before marking the url as done, so an article that
        # is dropped here is neither logged as queued nor blocked from
        # being picked up by a later section.
        if len(queued) >= self.max_articles_per_section:
            self._p('Skip (section full): ' + label)
            return
        self._p('Q: ' + label)
        self.urls_done.append(full_url)
        queued.append(
            dict(title=title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def preprocess_raw_html(self, raw_html, url):
        '''
        Drop articles rejected by should_skip_article(); otherwise defer to
        the base class implementation.
        '''
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            # http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this recipe is subclassed.
        return super(AlMonitor, self).preprocess_raw_html(raw_html, url)

    def populate_article_metadata(self, article, soup, first):
        '''
        Copy the text of the ``<div id="summary">`` node, when present, into
        ``article.text_summary``.
        '''
        summary_node = soup.find('div', {'id': 'summary'})
        if not summary_node:
            self._p('No summary')
            return
        summary = self.text(summary_node)
        self._p('Summary: ' + summary)
        article.text_summary = summary

    def should_skip_article(self, soup):
        '''
        Return a reason string when the article should be skipped, else
        False. Articles without a recognisable date are never skipped.
        '''
        date = self.scrape_article_date(soup)
        if not date:
            return False
        # The scraped stamp has no time-of-day, so the age is taken in
        # whole days; with oldest_article = 1.5 this skips anything posted
        # two or more calendar days ago.
        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return "too old"
        return False

    def scrape_article_date(self, soup):
        '''
        Return the datetime of the first ``<span>`` whose text starts with a
        parseable "Posted <Month> <day>, <year>" stamp, or None.
        '''
        for span in soup.findAll('span'):
            hit = self._posted_rgx.match(self.text(span))
            if not hit:
                continue
            # Parse only the matched stamp; text following the date in the
            # same span would otherwise make strptime fail.
            dt = self.date_from_string(hit.group(0))
            if dt:
                return dt
        return None

    def date_from_string(self, datestring):
        '''
        Parse a string such as "Posted September 17, 2014".

        Returns a datetime, or None when the string does not match. The
        month name is parsed with ``%B`` and therefore depends on the
        process locale.
        '''
        try:
            dt = datetime.datetime.strptime(datestring, self._posted_format)
        except (ValueError, TypeError):
            dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def abs_url(self, url):
        '''
        Return ``url`` as an absolute al-monitor url without its fragment.

        Raises ValueError when ``url`` neither contains the site host nor
        starts with '/'.
        '''
        if 'www.al-monitor.com' in url:
            absolute = url
        elif url.startswith('/'):
            absolute = 'http://www.al-monitor.com' + url
        else:
            self._p('Not sure how to make abs_url: ' + url)
            raise ValueError('Cannot make absolute url from ' + repr(url))
        # Keep everything before the first '#'.
        return absolute.split('#', 1)[0]

    def text(self, n):
        '''Return the text content of node ``n`` with surrounding whitespace stripped.'''
        return self.tag_to_string(n).strip()

    def _dbg_soup_node(self, node):
        '''Return a one-line debug description (class, id, role, text) of ``node``.'''
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' role: ' + str(node.get('role')).strip() + \
            ' txt: ' + self.text(node)
        return s

    def _p(self, msg):
        '''
        Print ``msg`` (first 100 characters) prefixed with the upper-cased
        name of the calling function.
        '''
        frame = inspect.currentframe()
        try:
            caller = frame.f_back if frame is not None else None
            calname = caller.f_code.co_name if caller is not None else '?'
        finally:
            # Drop the frame reference promptly to avoid a reference cycle.
            del frame
        print('[' + calname.upper() + '] ' + msg[0:100])