calibre/recipes/ars_technica.recipe at 62118462b255de39b4131df2090bd04db191d4e9 · unix-krishna/calibre · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
__license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class ArsTechnica(BasicNewsRecipe):
    """Calibre news recipe for arstechnica.com.

    Articles are pulled from the RSS feeds listed in ``feeds``.  Each
    downloaded page is trimmed to the elements named in ``keep_only_tags``,
    multi-page articles are stitched together by ``append_page``, and
    hyperlinks are flattened to plain text or image wrappers by
    ``preprocess_html``.
    """

    title = u'Ars Technica'
    language = 'en'
    __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks'
    description = 'Ars Technica: Serving the technologist for 1.2 decades'
    publisher = 'Conde Nast Publications'
    category = 'news, IT, technology'
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    publication_type = 'newsportal'
    extra_css = '''
                            body {font-family: Arial,sans-serif}
                            .heading{font-family: "Times New Roman",serif}
                            .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
                            img{display: block}
                            .caption-text{font-size:small; font-style:italic}
                            .caption-byline{font-size:small; font-style:italic; font-weight:bold}
    '''

    # Metadata handed to the conversion pipeline; mirrors the attributes above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Only these elements of a downloaded page are kept.
    keep_only_tags = [
        dict(attrs={'class': 'standalone'}),
        dict(attrs={'id': 'article-guts'}),
    ]

    # Elements stripped from whatever survives keep_only_tags.
    remove_tags = [
        dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
        dict(attrs={'class': 'corner-info'}),
        dict(attrs={'id': 'article-footer-wrap'}),
        dict(attrs={'class': 'article-expander'}),
        dict(name='nav', attrs={'class': 'subheading'}),
    ]
    remove_attributes = ['lang']

    # (section title, feed URL) pairs.
    feeds = [
        (u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'),
        (u'Technology Lab (Information Technology)', u'http://feeds.arstechnica.com/arstechnica/technology-lab'),
        (u'Gear & Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets'),
        (u'Ministry of Innovation (Business of Technology)', u'http://feeds.arstechnica.com/arstechnica/business'),
        (u'Risk Assessment (Security & Hacktivism)', u'http://feeds.arstechnica.com/arstechnica/security'),
        (u'Law & Disorder (Civilizations & Discontents)', u'http://feeds.arstechnica.com/arstechnica/tech-policy'),
        (u'Infinite Loop (Apple Ecosystem)', u'http://feeds.arstechnica.com/arstechnica/apple'),
        (u'Opposable Thumbs (Gaming & Entertainment)', u'http://feeds.arstechnica.com/arstechnica/gaming'),
        (u'Scientific Method (Science & Exploration)', u'http://feeds.arstechnica.com/arstechnica/science'),
        (u'Multiverse (Explorations & Meditations on Sci-Fi)', u'http://feeds.arstechnica.com/arstechnica/multiverse'),
        (u'Cars Technica (All Things Automotive)', u'http://feeds.arstechnica.com/arstechnica/cars'),
        (u'Staff Blogs (From the Minds of Ars)', u'http://feeds.arstechnica.com/arstechnica/staff-blogs'),
    ]

    def append_page(self, soup, appendtag, position):
        """Recursively fetch the following pages of an article and splice them in.

        Looks in ``soup`` for a pager (class ``numbers``) containing a ``next``
        marker.  When one is found, the linked page is downloaded, its
        ``article-guts`` element is extended with any further pages, and the
        result is inserted into ``appendtag`` at index ``position``.  The
        pager is removed from ``soup`` once its next page has been attached.

        If the next-page link or the next page's article body cannot be
        found, pagination stops and ``soup`` is left as it is.
        """
        pager = soup.find(attrs={'class': 'numbers'})
        if not pager:
            return
        nexttag = pager.find(attrs={'class': 'next'})
        if not nexttag:
            return

        # The URL lives on the element wrapping the "next" marker; tolerate a
        # wrapper without an href instead of raising KeyError.
        link = nexttag.parent
        nurl = link.get('href') if link is not None else None
        if not nurl:
            return

        # Second argument asks index_to_soup for the raw page data, which is
        # then parsed here with the recipe's declared encoding.
        rawc = self.index_to_soup(nurl, True)
        soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
        texttag = soup2.find(attrs={'id': 'article-guts'})
        if texttag is None:
            # Next page has no recognisable article body; keep what we have.
            return

        # Pages after this one are appended at the end of this page's body.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        pager.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge paginated content and simplify links and images.

        * Following pages are inserted into ``soup.body`` at index 3.
        * An ``<a>`` with a single text child is replaced by that text; one
          that contains an image becomes an attribute-less ``<div>``; any
          other is replaced by its text content.
        * ``<img>`` elements lacking an ``alt`` attribute get ``alt="image"``.

        Returns the modified ``soup``.
        """
        self.append_page(soup, soup.body, 3)
        for anchor in soup.findAll('a'):
            if anchor.string is not None:
                anchor.replaceWith(anchor.string)
            elif anchor.find('img'):
                # Keep the image but drop the link around it.
                anchor.name = 'div'
                anchor.attrs = []
            else:
                anchor.replaceWith(self.tag_to_string(anchor))
        for img in soup.findAll('img'):
            # Membership tests on a Tag inspect its children, not its
            # attributes, so look the attribute up explicitly; otherwise
            # every existing alt text would be overwritten.
            if img.get('alt') is None:
                img['alt'] = 'image'
        return soup

    def preprocess_raw_html(self, raw, url):
        """Replace everything before ``</head>`` with a bare ``<html><head>``.

        The original head contents are discarded (presumably to drop the
        site's scripts and metadata -- confirm against the live markup).
        If ``raw`` has no ``</head>`` it is returned unchanged rather than
        being truncated to its final character.
        """
        head_end = raw.find('</head>')
        if head_end == -1:
            return raw
        return '<html><head>' + raw[head_end:]