calibre/recipes/al_monitor.recipe at 62118462b255de39b4131df2090bd04db191d4e9 · unix-krishna/calibre · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env  python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class AlMonitor(BasicNewsRecipe):
    """Recipe that gathers articles from the al-monitor.com "pulse" pages.

    ``parse_index`` walks every entry of ``sections``, and the helper methods
    queue links whose href contains ``pulse/originals``.
    """
    title = u'Al Monitor'
    __author__ = u'spswerling'
    # NOTE(review): the leading 'a ' looks like a typo -- confirm before changing.
    description = 'a The Pulse of the Middle East'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
    # The masthead reuses the same logo image as cover_img_url.
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    # These options are not read anywhere else in this file; presumably they
    # are consumed by BasicNewsRecipe -- verify against the calibre API docs.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    # Compared by should_skip_article against the article age in whole days.
    oldest_article = 1.5
    # Upper bound on queued articles per section (see queue_article_link).
    max_articles_per_section = 15

    # (section name, section index URL) pairs walked by parse_index.
    sections = [
        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
        # NOTE(review): 'palistine' looks like a misspelling of 'palestine' in
        # both the label and the URL path -- confirm against the live site.
        (u'palistine', u'http://www.al-monitor.com/pulse/palistine-pulse'),
        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, rgx_str):
        """Build one matcher dict for a tag name plus a regex on one attribute.

        Called as a plain function while the class body is being executed, so
        it takes no ``self``.

        elt     -- tag name to match
        attr    -- attribute name whose value is tested
        rgx_str -- regular expression source; compiled case-insensitively
        """
        pattern = re.compile(rgx_str, re.IGNORECASE)
        return {'name': elt, 'attrs': {attr: pattern}}

    # Matchers for page elements to strip. The list is not read anywhere else
    # in this file; presumably BasicNewsRecipe consumes it -- verify against
    # the calibre API docs.
    remove_tags = [
        # Page chrome, selected by a list of literal id values.
        dict(attrs={'id': [
            'header',
            'pulsebanner',
            'relatedarticles',
            'sidecolumn',
            'disqus',
            'footer',
            'footer2',
            'footer3',
            'mobile-extras',
        ]}),
        # Regex-based matchers; tag_matcher compiles each pattern with
        # re.IGNORECASE. '^clear$' and '^overlay$' are anchored at both ends,
        # the other patterns are unanchored.
        tag_matcher('hr', 'id', 'spacer'),
        tag_matcher('a', 'title', 'print this article'),
        tag_matcher('div', 'class', 'extras'),
        tag_matcher('div', 'class', '^clear$'),
        tag_matcher('div', 'class', '^overlay$'),
        tag_matcher('div', 'class', 'shareTag'),
    ]

    # Spidering state.
    #   articles:  section name -> list of article dicts, filled by
    #              parse_section / queue_article_link.
    #   urls_done: absolute URLs already seen, used by queue_article_link to
    #              skip duplicates.
    # NOTE: both are mutable objects defined on the class, so they are shared
    # by every instance unless an instance rebinds them.
    articles = {}
    urls_done = []

    def parse_index(self):
        for section in self.sections:
            self.parse_section(section[0], section[1])
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

    def parse_section(self, section, url):

        self.articles[section] = []

        try:
            self._p('process section  ' + section + ', url: ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider section')
            return []

        self._p('Got section. Processing links.')

        for link in soup.findAll('a'):
            href = link.get('href')
            text = self.text(link)
            if text and ('pulse/originals' in href):
                self.process_link(section, link)

    def process_link(self, section, link):
        title = self.text(link)
        if len(title) > 120:
            title = title[0:120] + '...'
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, href, title)

    def queue_article_link(self, section, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            self._p('Skip (already Qd): ' + ' - '.join([section, title, url]))
            return

        self._p('Q: ' + ' - '.join([section, title, url]))
        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        self.articles[section].append(
            dict(title=title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def preprocess_raw_html(self, raw_html, url):
        """Drop articles that ``should_skip_article`` rejects.

        raw_html -- downloaded page markup
        url      -- address the markup came from (used for logging)

        Returns ``None`` for a skipped article, otherwise whatever the parent
        class's ``preprocess_raw_html`` returns for the same arguments.
        """
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            #   http://www.mobileread.com/forums/showthread.php?p=2931136
            return None

        # Name the class explicitly: super(self.__class__, self) resolves to
        # this very method again when called on a subclass instance and then
        # recurses without end.
        return super(AlMonitor, self).preprocess_raw_html(raw_html, url)

    def populate_article_metadata(self, article, soup, first):
        summary_node = soup.find('div', {'id': 'summary'})
        if summary_node:
            summary = self.text(summary_node)
            self._p('Summary: ' + summary)
            article.text_summary = summary
        else:
            self._p('No summary')

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if (age > self.oldest_article):
            return "too old"
        return False

    def scrape_article_date(self, soup):
        for span in soup.findAll('span'):
            txt = self.text(span)
            rgx = re.compile('Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*')
            hit = rgx.match(txt)
            if hit:
                return self.date_from_string(txt)

        return None

    def date_from_string(self, datestring):
        try:
            # eg: Posted September 17, 2014
            dt = datetime.datetime.strptime(datestring, "Posted %B %d, %Y")
        except:
            dt = None

        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)

        return dt

    def abs_url(self, url):
        if 'www.al-monitor.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.al-monitor.com' + url
        else:
            self._p('Not sure how to make abs_url: ' + url)
            raise

        if '#' in abs_url:
            abs_url = ''.join(abs_url.split('#')[0:-1])

        return abs_url

    def text(self, n):
        return self.tag_to_string(n).strip()

    def _dbg_soup_node(self, node):
        s = '   cls: ' + str(node.get('class')).strip() + \
            '  id: ' + str(node.get('id')).strip() + \
            '  role: ' + str(node.get('role')).strip() + \
            ' txt: ' + self.text(node)
        return s

    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:100])