forked from kovidgoyal/calibre
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathars_technica.recipe
More file actions
98 lines (88 loc) · 4.43 KB
/
Copy pathars_technica.recipe
File metadata and controls
98 lines (88 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
__license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class ArsTechnica(BasicNewsRecipe):
    """Calibre news recipe for arstechnica.com.

    Articles are taken from the RSS feeds listed in ``feeds``.  On top of the
    declarative clean-up settings below, the recipe:

    * strips everything in front of ``</head>`` from the downloaded markup
      (``preprocess_raw_html``),
    * merges the follow-up pages of a paginated article into the first page
      (``append_page``),
    * replaces hyperlinks with their text, or turns them into ``div``
      elements when they wrap an image, and gives every image without an
      ``alt`` attribute a placeholder one (``preprocess_html``).
    """

    # --- Metadata ---------------------------------------------------------
    title = u'Ars Technica'
    language = 'en'
    __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks'
    description = 'Ars Technica: Serving the technologist for 1.2 decades'
    publisher = 'Conde Nast Publications'
    category = 'news, IT, technology'

    # --- Download / parsing settings read by BasicNewsRecipe ---------------
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    publication_type = 'newsportal'

    # Stylesheet text. The lines are deliberately flush-left so that the
    # string value stays exactly as it was.
    extra_css = '''
body {font-family: Arial,sans-serif}
.heading{font-family: "Times New Roman",serif}
.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
img{display: block}
.caption-text{font-size:small; font-style:italic}
.caption-byline{font-size:small; font-style:italic; font-weight:bold}
'''

    # Metadata for the conversion step, reusing the values defined above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Elements to keep: the standalone header block and the article body.
    keep_only_tags = [
        dict(attrs={'class': 'standalone'}),
        dict(attrs={'id': 'article-guts'}),
    ]

    # Elements to drop: embedded media / metadata tags and page furniture.
    remove_tags = [
        dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
        dict(attrs={'class': 'corner-info'}),
        dict(attrs={'id': 'article-footer-wrap'}),
        dict(attrs={'class': 'article-expander'}),
        dict(name='nav', attrs={'class': 'subheading'}),
    ]
    remove_attributes = ['lang']

    # (section title, feed URL) pairs.
    feeds = [
        (u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'),
        (u'Technology Lab (Information Technology)', u'http://feeds.arstechnica.com/arstechnica/technology-lab'),
        (u'Gear & Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets'),
        (u'Ministry of Innovation (Business of Technology)', u'http://feeds.arstechnica.com/arstechnica/business'),
        (u'Risk Assessment (Security & Hacktivism)', u'http://feeds.arstechnica.com/arstechnica/security'),
        (u'Law & Disorder (Civilizations & Discontents)', u'http://feeds.arstechnica.com/arstechnica/tech-policy'),
        (u'Infinite Loop (Apple Ecosystem)', u'http://feeds.arstechnica.com/arstechnica/apple'),
        (u'Opposable Thumbs (Gaming & Entertainment)', u'http://feeds.arstechnica.com/arstechnica/gaming'),
        (u'Scientific Method (Science & Exploration)', u'http://feeds.arstechnica.com/arstechnica/science'),
        (u'Multiverse (Explorations & Meditations on Sci-Fi)', u'http://feeds.arstechnica.com/arstechnica/multiverse'),
        (u'Cars Technica (All Things Automotive)', u'http://feeds.arstechnica.com/arstechnica/cars'),
        (u'Staff Blogs (From the Minds of Ars)', u'http://feeds.arstechnica.com/arstechnica/staff-blogs'),
    ]

    def append_page(self, soup, appendtag, position):
        """Merge the following pages of a paginated article into ``appendtag``.

        Looks in ``soup`` for a pager (an element with class ``numbers``)
        containing a ``next`` marker.  When one is found, the linked page is
        downloaded, its own follow-up pages are merged first (recursion), and
        its ``article-guts`` element is inserted into ``appendtag``.

        :param soup: parsed page to search for the pager.
        :param appendtag: tag that receives the next page's article body.
        :param position: child index handed to ``appendtag.insert``.
        :return: ``None``; ``soup`` / ``appendtag`` are modified in place.
        """
        pager = soup.find(attrs={'class': 'numbers'})
        if not pager:
            return
        nexttag = pager.find(attrs={'class': 'next'})
        if not nexttag:
            return

        # The "next" marker sits inside the link, so the URL is on its parent.
        nurl = nexttag.parent['href']
        # Second argument True: ask for the raw page data (see calibre's
        # index_to_soup API) so it can be parsed with the recipe's encoding.
        rawc = self.index_to_soup(nurl, True)
        soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
        texttag = soup2.find(attrs={'id': 'article-guts'})
        if texttag is None:
            # The next page has no article body to merge; stop here instead
            # of failing on ``texttag.contents`` below.
            return

        # Pages after the next one are appended at the end of its body.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        pager.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Stitch paginated articles together and simplify links and images.

        :param soup: parsed article page; modified in place.
        :return: the same ``soup`` object.
        """
        self.append_page(soup, soup.body, 3)

        for item in soup.findAll('a'):
            if item.string is not None:
                # Link with a single text child: keep just the text.
                item.replaceWith(item.string)
            elif item.find('img'):
                # Link wrapping an image: keep the image, neutralise the link.
                item.name = 'div'
                item.attrs = []
            else:
                # Link with mixed content and no image: flatten to text.
                link_text = self.tag_to_string(item)
                item.replaceWith(link_text)

        for item in soup.findAll('img'):
            # ``'alt' in item`` would test the tag's children, not its
            # attributes, so look the attribute up explicitly.
            if item.get('alt') is None:
                item['alt'] = 'image'
        return soup

    def preprocess_raw_html(self, raw, url):
        """Replace everything before ``</head>`` with an empty ``<html><head>``.

        :param raw: downloaded page markup as a string.
        :param url: address of the page (unused).
        :return: ``'<html><head>'`` followed by ``raw`` from ``</head>``
            onwards, or ``raw`` unchanged when it contains no ``</head>``.
        """
        head_end = raw.find('</head>')
        if head_end == -1:
            # Without this guard the slice ``raw[-1:]`` would throw away the
            # whole page except its last character.
            return raw
        return '<html><head>' + raw[head_end:]