Skip to content

Commit f8bbb23

Browse files
committed
Merge pull request #3 from ccarterdev/master
Updated Pull Request. Plugin system, several plugins, improved resolveimg.py bin, improved scoring ruleset.
2 parents f7727c7 + d2f88bb commit f8bbb23

13 files changed

Lines changed: 199 additions & 66 deletions

File tree

CHANGES.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,18 @@ v0.2.0, 2014-12-22 --
6262

6363
v0.2.1, 2014-12-22 --
6464
- fixed an oopsy in setup.py and some other minor tweaks to docs
65+
66+
v0.3, 2015-07-03 --
67+
- Added plugin support
68+
- Created plugin directory
69+
- Removed the ImgurPageResolver class, replaced with the imgur.py plugin.
70+
- Added instagram, flickr plugins
71+
- Changed WebResolver defaults, load_images and use_js_ruleset now default to true.
72+
- Added another rule to the js_ruleset
73+
- Added several command options to resolveimg.py to help with debugging and
74+
performance testing.
75+
- Fixed some bugs
76+
- changed data files installation directory after installing
77+
from setup.py
78+
- Added opengraph plugin
79+

README.rst

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ USAGE
1616
try:
1717
i = imageresolver.ImageResolver()
1818
i.register(imageresolver.FileExtensionResolver())
19-
i.register(imageresolver.ImgurPageResolver())
2019
i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml',blacklist='easylist.txt'))
2120
url = sys.argv[1]
2221

@@ -84,13 +83,6 @@ FileExtensionResolver() METHODS
8483

8584
Returns the url if the extension matches a possible image
8685

87-
ImgurPageResolver() METHODS
88-
---------------------------
89-
90-
**resolve** *(string url)*
91-
92-
Returns an Imgur image url if `url` matches the pattern of an Imgur page
93-
9486
WebpageResolver() METHODS
9587
-------------------------
9688

@@ -137,18 +129,6 @@ By default this exception is skipped and logged but can be enabled with "skip_fe
137129
TODO
138130
-----------------
139131

140-
Still missing the following resolvers:
141-
142-
* ImgurAlbumResolver()
143-
144-
* FlickrResolver()
145-
146-
* OpengraphResolver()
147-
148-
* InstagramResolver()
149-
150-
I have no plans to implement a 9gag resolver.
151-
152132
Need to implement better caching. Future plan is to include a configurable cache method so images seen across sessions can be cached for better performance
153133

154134

bin/resolveimg.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
import imageresolver
55
import logging
6+
import time
67
from optparse import OptionParser
78

89
logger = logging.getLogger('ImageResolver')
@@ -14,10 +15,16 @@
1415
opts.add_option("-r","--max-read", dest="max_read",help="Set the max read size")
1516
opts.add_option("-c","--chunk-size",dest="chunk_size",help="Chunk size to read on each pass")
1617
opts.add_option("-a","--read-all",dest="read_all",help="Read the entire image before checking size. Useful for some JPGs. Overrides --max-read")
18+
opts.add_option("-b","--adblock", action="store_true",dest="use_adblock_filters",help="Use adblock filters.")
19+
opts.add_option("-s","--no-ruleset", action="store_true",dest="use_js_ruleset",help="Use a custom ruleset for scoring.")
20+
opts.add_option("--benchmark", action="store_true",dest="benchmark",help="Benchmark the total time it takes for the script to return an image")
21+
opts.add_option("-n","--no-load-images", action="store_true",dest="load_images",help="Do not load images")
22+
opts.add_option("-p","--parser", dest="parser",help="Choose a parser to use")
1723

1824
(options,args) = opts.parse_args()
1925

2026
kw_options = {}
27+
2128
if options.read_all:
2229
kw_options['read_all'] = True
2330
elif options.max_read:
@@ -26,6 +33,16 @@
2633
if options.chunk_size:
2734
kw_options['chunk_size'] = int(options.chunk_size)
2835

36+
if options.use_js_ruleset:
37+
kw_options['use_js_ruleset'] = False
38+
39+
if options.parser:
40+
kw_options['parser'] = options.parser
41+
42+
if options.load_images:
43+
kw_options['load_images'] = False
44+
45+
kw_options['use_adblock_filters'] = options.use_adblock_filters
2946
kw_options['debug'] = options.debug
3047

3148
try:
@@ -39,10 +56,16 @@
3956
print "URL required. Please use the url option or pass a url as the first argument"
4057
sys.exit(-1)
4158

59+
60+
if options.benchmark:
61+
t1 = time.time()
62+
4263
i = imageresolver.ImageResolver(**kw_options)
4364
i.register(imageresolver.FileExtensionResolver())
44-
i.register(imageresolver.ImgurPageResolver())
45-
i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml'))
65+
i.register(imageresolver.WebpageResolver(**kw_options))
4666

4767
print i.resolve(url)
4868

69+
if options.benchmark:
70+
print 'TOTAL TIME', time.time() - t1
71+

docs/README.txt

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ USAGE
1616
try:
1717
i = imageresolver.ImageResolver()
1818
i.register(imageresolver.FileExtensionResolver())
19-
i.register(imageresolver.ImgurPageResolver())
2019
i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml',blacklist='easylist.txt'))
2120
url = sys.argv[1]
2221

@@ -84,13 +83,6 @@ FileExtensionResolver() METHODS
8483

8584
Returns the url if the extension matches a possible image
8685

87-
ImgurPageResolver() METHODS
88-
---------------------------
89-
90-
**resolve** *(string url)*
91-
92-
Returns an Imgur image url if `url` matches the pattern of an Imgur page
93-
9486
WebpageResolver() METHODS
9587
-------------------------
9688

@@ -137,18 +129,6 @@ By default this exception is skipped and logged but can be enabled with "skip_fe
137129
TODO
138130
-----------------
139131

140-
Still missing the following resolvers:
141-
142-
* ImgurAlbumResolver()
143-
144-
* FlickrResolver()
145-
146-
* OpengraphResolver()
147-
148-
* InstagramResolver()
149-
150-
I have no plans to implement a 9gag resolver.
151-
152132
Need to implement better caching. Future plan is to include a configurable cache method so images seen across sessions can be cached for better performance
153133

154134

imageresolver/__init__.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -149,23 +149,11 @@ def resolve(self,url,**kwargs):
149149

150150
return None
151151

152-
class ImgurPageResolver(object):
153-
# works a little different than the JS version.
154-
# it should drop references to galleries and find the image
155-
# could be buggy!
156-
def resolve(self,url,**kwargs):
157-
logger.debug('Resolving using Imgur ' + str(url))
158-
parsed = urlparse(url)
159-
if re.search( 'imgur.com(:80)*', parsed.netloc) and os.path.basename(parsed.path):
160-
return 'http://i.imgur.com/' + os.path.basename(parsed.path) + '.jpg'
161-
162-
return None
163-
164152
class WebpageResolver(object):
165153
def __init__(self,**kwargs):
166-
self.load_images = kwargs.get('load_images',False)
167-
self.use_js_ruleset = kwargs.get('use_js_ruleset',False)
168-
self.use_adblock_filters = kwargs.get('use_adblock_filters',True)
154+
self.load_images = kwargs.get('load_images',True)
155+
self.use_js_ruleset = kwargs.get('use_js_ruleset',True)
156+
self.use_adblock_filters = kwargs.get('use_adblock_filters',False)
169157
self.significant_surface = kwargs.get('significant_surface', 100*100)
170158

171159
cwd = os.path.dirname(__file__)
@@ -218,6 +206,7 @@ def _score(self,image):
218206
{'pattern':'1x1','score':-1},
219207
{'pattern':'pixel','score':-1},
220208
{'pattern':'ads','score':-1},
209+
{'pattern':'transparent','score':-1}
221210
]
222211

223212
for r in rules:
@@ -254,12 +243,35 @@ def _score(self,image):
254243

255244
return score
256245

246+
def plugin_resolve(self,url,soup,**kwargs):
    """Ask every plugin in the ``plugins`` directory for an image.

    Each ``*.py`` file (except ``__init__.py``) found in the ``plugins``
    directory next to this module is imported and its ``Plugin`` class is
    instantiated. Plugins are then queried in alphabetical order of their
    file name and the first truthy result wins.

    :param url: address of the page being resolved.
    :param soup: parsed document handed to each plugin's ``get_image``.
    :param kwargs: accepted for interface compatibility; not used here.
    :returns: the first truthy value returned by a plugin, otherwise None.
    """
    plugin_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'plugins')

    # Sort the module names so plugin precedence is deterministic instead of
    # depending on directory listing / dict ordering.
    module_names = sorted(
        name
        for name, extension in map(os.path.splitext, os.listdir(plugin_dir))
        if extension == '.py' and name != '__init__'
    )

    plugins = []
    sys.path.insert(0, plugin_dir)
    try:
        for name in module_names:
            plugins.append(__import__(name).Plugin())
    finally:
        # Always undo the sys.path change, even if a plugin fails to import,
        # and remove by value in case a plugin touched sys.path itself.
        if plugin_dir in sys.path:
            sys.path.remove(plugin_dir)

    for plugin in plugins:
        image = plugin.get_image(url,soup)
        if image:
            return image
    return None
262+
263+
257264
def resolve(self,url,**kwargs):
258265
logger.debug('Resolving as a webpage ' + str(url))
259-
260266
ir = ImageResolver()
261267
content = ir.fetch(url)
262268
soup = BeautifulSoup(content,self.parser)
269+
270+
plugin_image = self.plugin_resolve(url,soup)
271+
272+
if plugin_image:
273+
return plugin_image
274+
263275
images = soup.find_all('img')
264276

265277
candidates = []

imageresolver/abpy

Submodule abpy updated from a8ff334 to 6177472

imageresolver/plugins/__init__.py

Whitespace-only changes.

imageresolver/plugins/flickr.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import re
2+
import os
3+
import requests
4+
import logging
5+
from bs4 import BeautifulSoup
6+
7+
class Plugin:
8+
def get_image(self, url, soup):
9+
if re.search('http(s*):\/\/www.flickr.com\/photos\/([^\/]*)\/([^\/]*)\/(.*)', url):
10+
logger = logging.getLogger('ImageResolver')
11+
logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url))
12+
tag = soup.find('img', {'class':'main-photo'})
13+
if tag:
14+
return 'https:' + tag['src']
15+
return None

imageresolver/plugins/imgur.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import re
2+
import os
3+
import requests
4+
from bs4 import BeautifulSoup
5+
from urlparse import urlparse
6+
import logging
7+
8+
class Plugin:
9+
def get_image(self, url, soup):
10+
if re.search('http(s*):\/\/(i\.|m\.)*imgur.com\/(gallery\/){0,1}(.*)', url):
11+
logger = logging.getLogger('ImageResolver')
12+
logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url))
13+
parsed = urlparse(url)
14+
15+
if parsed.path[1:8] == 'gallery':
16+
logger.debug('Detected imgur gallery.')
17+
tag = soup.find('div', {'id':'1','class':'album-image'})
18+
image = re.findall('i\.imgur.com\/.*\.\w+', str(tag))
19+
if len(image) >= 1:
20+
return 'http://' + image[0]
21+
22+
elif parsed.path[0:3] == '/a/':
23+
logger.debug('Detected imgur album.')
24+
tag = soup.find('meta',{'name':'twitter:image0:src'})
25+
if tag:
26+
return tag['content']
27+
28+
else:
29+
parsed = urlparse(url)
30+
if re.search('imgur.com(:80)*', parsed.netloc) and os.path.basename(parsed.path):
31+
return 'http://i.imgur.com/' + os.path.basename(parsed.path) + '.jpg'
32+
return None
33+

imageresolver/plugins/opengraph.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import re
2+
import os
3+
import logging
4+
from bs4 import BeautifulSoup
5+
from operator import itemgetter
6+
7+
class Plugin:
8+
def get_image(self, url, soup):
9+
10+
ogtags = [{'type':'facebook','attribute':'property', 'name':'og:image', 'value':'content'},
11+
{'type':'facebook','attribute':'rel', 'name':'image_src', 'value':'href'},
12+
{'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'value'},
13+
{'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'content'},
14+
{'type':'twitter','attribute':'property', 'name':'twitter:image', 'value':'content'},
15+
{'type':'image','attribute':'itemprop', 'name':'image', 'value':'content'}]
16+
17+
ogimages = []
18+
19+
for ogtag in ogtags:
20+
tags = soup.find_all('meta', {ogtag['attribute']:ogtag['name']})
21+
if tags != []:
22+
for image in tags:
23+
try:
24+
ogimages = ogimages + [{'url':image[ogtag['value']], 'type':ogtag['type'], 'score':0} for image in tags]
25+
except KeyError as e:
26+
pass
27+
28+
ogimages_len = len(ogimages)
29+
30+
# if more than 1 image, score and return the best one
31+
if ogimages_len >= 1:
32+
if ogimages_len == 1:
33+
logger = logging.getLogger('ImageResolver')
34+
logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url))
35+
resolved_image = ogimages[0]['url']
36+
else:
37+
for image in ogimages:
38+
if re.search('(large|big)', image['url'], re.IGNORECASE):
39+
image['score'] += 1
40+
if image['type'] == 'twitter':
41+
image['score'] += 1
42+
43+
ogimages.sort(key=itemgetter('score'), reverse=True)
44+
resolved_image = ogimages[0]['url']
45+
46+
if not re.search('^https?:', resolved_image):
47+
if resolved_image.startswith('//'):
48+
return 'http:' + resolved_image
49+
else:
50+
return resolved_image
51+
52+
53+
return None
54+

0 commit comments

Comments
 (0)