Skip to content

Commit 32feb11

Browse files
committed
Added scoring to opengraph plugin, changed default options.
1 parent 773b56f commit 32feb11

3 files changed

Lines changed: 40 additions & 20 deletions

File tree

bin/resolveimg.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,16 @@
1515
opts.add_option("-r","--max-read", dest="max_read",help="Set the max read size")
1616
opts.add_option("-c","--chunk-size",dest="chunk_size",help="Chunk size to read on each pass")
1717
opts.add_option("-a","--read-all",dest="read_all",help="Read the entire image before checking size. Useful for some JPGs. Overrides --max-read")
18-
opts.add_option("--no-adblock", action="store_true",dest="use_adblock_filters",help="Do not use whitelist.txt or blacklist.txt adblock filters")
19-
opts.add_option("--no-ruleset", action="store_true",dest="use_js_ruleset",help="Do not use a custom ruleset for scoring.")
18+
opts.add_option("-b","--adblock", action="store_true",dest="use_adblock_filters",help="Use adblock filters.")
19+
opts.add_option("-s","--no-ruleset", action="store_true",dest="use_js_ruleset",help="Use a custom ruleset for scoring.")
2020
opts.add_option("--benchmark", action="store_true",dest="benchmark",help="Benchmark the total time it takes for the script to return an image")
21-
opts.add_option("-l","--load-images", action="store_true",dest="load_images",help="Load images")
21+
opts.add_option("-n","--no-load-images", action="store_true",dest="load_images",help="Do not load images")
2222
opts.add_option("-p","--parser", dest="parser",help="Choose a parser to use")
2323

2424
(options,args) = opts.parse_args()
2525

2626
kw_options = {}
27+
2728
if options.read_all:
2829
kw_options['read_all'] = True
2930
elif options.max_read:
@@ -32,17 +33,17 @@
3233
if options.chunk_size:
3334
kw_options['chunk_size'] = int(options.chunk_size)
3435

35-
if options.use_adblock_filters:
36-
kw_options['use_adblock_filters'] = False
37-
3836
if options.use_js_ruleset:
3937
kw_options['use_js_ruleset'] = False
4038

4139
if options.parser:
4240
kw_options['parser'] = options.parser
4341

42+
if options.load_images:
43+
kw_options['load_images'] = True
44+
45+
kw_options['use_adblock_filters'] = options.use_adblock_filters
4446
kw_options['debug'] = options.debug
45-
kw_options['load_images'] = options.load_images
4647

4748
try:
4849
url = args[0]

imageresolver/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ class WebpageResolver(object):
153153
def __init__(self,**kwargs):
154154
self.load_images = kwargs.get('load_images',True)
155155
self.use_js_ruleset = kwargs.get('use_js_ruleset',True)
156-
self.use_adblock_filters = kwargs.get('use_adblock_filters',True)
156+
self.use_adblock_filters = kwargs.get('use_adblock_filters',False)
157157
self.significant_surface = kwargs.get('significant_surface', 100*100)
158158

159159
cwd = os.path.dirname(__file__)

imageresolver/plugins/opengraph.py

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,48 @@
22
import os
33
import logging
44
from bs4 import BeautifulSoup
5+
from operator import itemgetter
56

67
class Plugin:
78
def get_image(self, url, soup):
89

9-
ogtags = [{'attribute':'property', 'name':'og:image', 'value':'content'},
10-
{'attribute':'rel', 'name':'image_src', 'value':'href'},
11-
{'attribute':'name', 'name':'twitter:image', 'value':'value'},
12-
{'attribute':'name', 'name':'twitter:image', 'value':'content'}]
10+
ogtags = [{'type':'facebook','attribute':'property', 'name':'og:image', 'value':'content'},
11+
{'type':'facebook','attribute':'rel', 'name':'image_src', 'value':'href'},
12+
{'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'value'},
13+
{'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'content'},
14+
{'type':'twitter','attribute':'property', 'name':'twitter:image', 'value':'content'},
15+
{'type':'image','attribute':'itemprop', 'name':'image', 'value':'content'}]
1316

1417
ogimages = []
1518

1619
for ogtag in ogtags:
1720
tags = soup.find_all('meta', {ogtag['attribute']:ogtag['name']})
1821
if tags != []:
19-
try:
20-
ogimages = ogimages + [image[ogtag['value']] for image in tags]
21-
except KeyError:
22-
pass
22+
for image in tags:
23+
try:
24+
ogimages = ogimages + [{'url':image[ogtag['value']], 'type':ogtag['type'], 'score':0} for image in tags]
25+
except KeyError as e:
26+
pass
2327

24-
if len(ogimages) >= 1:
25-
logger = logging.getLogger('ImageResolver')
26-
logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url))
27-
return ogimages[0]
28+
ogimages_len = len(ogimages)
29+
30+
# if more than 1 image, score and return the best one
31+
if ogimages_len >= 1:
32+
if ogimages_len == 1:
33+
logger = logging.getLogger('ImageResolver')
34+
logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url))
35+
return ogimages[0]['url']
36+
else:
37+
for image in ogimages:
38+
if re.search('(large|big)', image['url'], re.IGNORECASE):
39+
image['score'] += 1
40+
if image['type'] == 'twitter':
41+
image['score'] += 1
42+
43+
ogimages.sort(key=itemgetter('score'), reverse=True)
44+
45+
return ogimages[0]['url']
46+
2847

2948
return None
3049

0 commit comments

Comments
 (0)