diff --git a/.gitignore b/.gitignore index 2a1aff013..f4340abf1 100644 --- a/.gitignore +++ b/.gitignore @@ -6,14 +6,58 @@ settings/keys/* *.dot reports ENV -venv +.env .DS_Store build deploy/last-update logs/* -cache/* celerybeat.pid celerybeat-schedule .gitignore~ -static/scss/**/*.css.map -*.retry \ No newline at end of file +assets/* +*.ipynb +dump.rdb +Pipfile.lock + +*.css.map + +# Local dev settings — Ansible-generated or personal overrides, never commit +settings/local.py +settings/log_config.py +settings/prod.py +settings/aws.py + +# AWS credential symlinks / key files +awskeys.py +iam_keys +auth.json +*.pem +id_rsa* +*.der +my.cnf* +*credentials*.csv +*credentials*.json + +# Local scratch / personal directories +ryscratch/ +experimental/ +notebooks/ + +# IDE +.idea/ +*.komodoproject + +# Test drivers and binaries +test/chromedriver* +test/geckodriver* +test/selenium-server* +test/*.jar +test/*.zip + +# Misc local artifacts +venv/ +deploy/prod.wsgi +deploy/public_keys/ +test-data/ +test/campaign_starter.sql +test/*.log \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 000000000..a9f8d1be3 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.9.11 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 96d4df1d0..000000000 --- a/.travis.yml +++ /dev/null @@ -1,25 +0,0 @@ -language: python - -python: - - '2.7' - -services: - - redis-server - - mysql - -env: - global: - - DJANGO_SETTINGS_MODULE=regluit.settings.travis - - PYTHONPATH=/home/travis/build/EbookFoundation/ - -before_install: - - sudo mkdir /var/log/django - - sudo chmod 777 /var/log/django - - mkdir ~/build/EbookFoundation/regluit/settings/keys/ - - cp ~/build/EbookFoundation/regluit/settings/dummy/__init__.py ~/build/EbookFoundation/regluit/settings/keys/__init__.py - - openssl aes-256-cbc -K $encrypted_56eb2b7cc527_key -iv $encrypted_56eb2b7cc527_iv -in ~/build/EbookFoundation/regluit/test/travis-host.py.enc -out 
~/build/EbookFoundation/regluit/settings/keys/host.py -d - -install: - - pip install -r requirements_versioned.pip - -script: django-admin test diff --git a/Pipfile b/Pipfile new file mode 100644 index 000000000..284c65673 --- /dev/null +++ b/Pipfile @@ -0,0 +1,124 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +celery = "==4.4.7" +django = "==1.11.29" +django-contrib-comments = "==2.0.0" +django-extensions = "==3.1.1" +django-registration = "==2.4.1" +social-auth-app-django = "==2.1.0" +django-tastypie = "==0.14.1" +django-el-pagination = "==3.2.4" +django-selectable = "==1.1.0" +django-ckeditor = "==5.6.1" +django-storages = "==1.5.2" +sorl-thumbnail = "==12.6.3" +django-mptt = "==0.8.6" +pyepub = "==0.5.0" +django-sass-processor = "==0.8.2" +mysqlclient = "==1.4.6" +mailchimp3 = "==3.0.14" +boto3 = "==1.17.91" +pymarc = "==4.2.1" +beautifulsoup4 = "==4.11.1" +gitberg = "==0.8.7" +risparser = "==0.4.3" +django-jsonfield = "==1.0.0" +mechanize = "==0.4.5" +stripe = "==2.76.0" +selenium = "==3.141.0" +requests-mock = "==1.8.0" +redis = "==3.5.3" +xhtml2pdf = "==0.2.15" +pillow = "==9.5.0" +pypdf = ">=5.0.0" +amqp = "==2.6.1" +appdirs = "==1.4.4" +arabic-reshaper = "==3.0.0" +asn1crypto = "==1.5.1" +billiard = "==3.6.4.0" +botocore = "==1.20.112" +cachetools = "==5.3.2" +cairocffi = "==0.8.0" +certifi = "==2023.11.17" +cffi = "==1.16.0" +chardet = "==5.2.0" +charset-normalizer = "==3.3.2" +click = "==8.1.7" +colorama = "==0.4.6" +confusable-homoglyphs = "==3.2.0" +cryptography = "==41.0.7" +cssselect2 = "==0.7.0" +defusedxml = "==0.8.0rc2" +distlib = "==0.3.8" +django-js-asset = "==1.2.3" +docopt = "==0.6.2" +filelock = "==3.13.1" +gitdb = "==4.0.11" +"github3.py" = "==4.0.1" +gitpython = "==3.1.40" +html5lib = "==1.1" +idna = "==3.6" +isodate = "==0.6.1" +jinja2 = "==3.1.2" +jmespath = "==0.10.0" +kombu = "==4.6.11" +lxml = "==4.9.4" +markupsafe = "==2.1.3" +mock = "==2.0.0" +oauthlib = "==3.2.2" +oscrypto = 
"==1.3.0" +packaging = "==23.2" +pbr = "==6.0.0" +platformdirs = "==4.1.0" +pluggy = "==1.3.0" +pycparser = "==2.21" +pyhanko = "==0.21.0" +pyhanko-certvalidator = "==0.26.3" +pyopenssl = "==23.3.0" +pyparsing = "==3.1.1" +pypng = "==0.20220715.0" +pyproject-api = "==1.6.1" +python-bidi = "==0.4.2" +python-dateutil = "==2.8.2" +python-mimeparse = "==1.6.0" +python3-openid = "==3.2.0" +pytz = "==2023.3.post1" +pyyaml = "==6.0.1" +qrcode = "==7.4.2" +rdflib = "==7.0.0" +reportlab = "==4.0.8" +requests = ">=2.31.0" +requests-oauthlib = "==1.3.1" +s3transfer = "==0.4.2" +semver = "==2.2.0" +sh = "==2.0.6" +six = "==1.16.0" +smmap = "==5.0.1" +social-auth-core = "==4.5.1" +soupsieve = "==2.5" +sparqlwrapper = "==2.0.0" +svglib = "==1.5.1" +tinycss2 = "==1.2.1" +tomli = "==2.0.1" +tox = "==4.11.4" +tqdm = "==4.66.1" +typing-extensions = "==4.9.0" +tzlocal = "==5.2" +uritemplate = "==4.1.1" +uritools = "==4.0.2" +urllib3 = "==1.26.18" +vine = "==1.3.0" +virtualenv = "==20.25.0" +webencodings = "==0.5.1" +wikipedia = "==1.4.0" +django-email-change = {editable = true, ref = "fb063296cbf4e4a6d8a93d34d98fe0c7739c2e0d", git = "git+https://github.com/eshellman/django-email-change.git"} +django-notification = {editable = true, ref = "1ad2be4adf3551a3471d923380368341452e178a", git = "git+https://github.com/eshellman/django-notification.git"} +pyjwt = {extras = ["crypto"], version = "==2.8.0"} +pyoai = {editable = true, ref = "5ff2f15e869869e70d8139e4c37b7832854d7049", git = "git+https://github.com/infrae/pyoai.git"} + +[requires] +python_version = "3.9" diff --git a/README.md b/README.md index 66fc4a291..73a2dca03 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ regluit - "The Unglue.it web application and website" ======= -This repo - https://github.com/EbookFoundation/regluit will be the place for collaborative development for Unglue.it. Add issues and submit pull requests here. 
As of January 19, 2017, https://github.com/Gluejar/regluit is still being used for production builds. +Another repo - https://github.com/EbookFoundation/regluit will eventually be the place for collaborative development for Unglue.it. Add issues and submit pull requests there. As of September 1, 2019, https://github.com/Gluejar/regluit is still being used for production builds. The first version of the unglue.it codebase was a services-oriented project named "unglu". We decided that "unglu" was too complicated, so we started over and named the new project "regluit". @@ -10,49 +10,24 @@ contains four main applications: `core`, `frontend`, `api` and `payment` that ca and configured on as many ec2 instances that are needed to support traffic. The partitioning between these modules is not as clean as would be ideal. `payment` is particularly messy because we had to retool it twice because we had to switch from Paypal to Amazon Payments to Stripe. -regluit was originally developed on Django 1.3 (python 2.7) and currently runs on Django 1.8. +regluit was originally developed on Django 1.3 (python 2.7) and currently runs on Django 1.11 (Python 3.8). -Development (Vagrant + Virtualbox) ------- - -The recommended method for local development is to create a virtual machine with [Vagrant](https://www.vagrantup.com/) and [Virtualbox](https://www.virtualbox.org/wiki/Downloads). -With this method, the only requirements on the host machine are `virtualbox` and `vagrant`. -Vagrant will use the `ansible-local` provisioner, therefore installing python and ansible on the host machine is not necessary. - -__Instructions for Ubuntu 16:__ -1. Install virtualbox: `sudo apt-get install virtualbox` -2. Install vagrant: `sudo apt-get install vagrant` -3. Clone the `EbookFoundation/regluit` repository. -4. Navigate to the base directory of the cloned repo (where `Vagrantfile` is located). -5. Run `vagrant up` to create the VM, install dependencies, and start necessary services. 
- * Note: This step may take up to 15 minutes to complete. -6. Once the VM has been created, run `vagrant ssh` to log in to the virtual machine you just created. If provisioning was successful, you should see a success message upon login. - * If virtualenv doesn't activate upon login, you can do it manually by running `cd /opt/regluit && source venv/bin/activate` -7. Within the VM, run `./manage.py runserver 0.0.0.0:8000` to start the Django development server. -8. On your host machine, open your web browser of choice and navigate to `http://127.0.0.1:8000` - -__Instructions for other platforms (Windows/OSX):__ -* Steps are essentially the same, except for the installation of Vagrant and Virtualbox. Refer to each package's documentation for specific installation instructions. - -_NOTE:_ If running Windows on your host machine, ensure you are running `vagrant up` from an elevated command prompt, e.g. right click on Command Prompt -> Run As Administrator. - - -Development (Host Machine) +Develop ------- Here are some instructions for setting up regluit for development on -an Ubuntu system. If you are on OS X see notes below -to install python-setuptools in step 1: +an Ubuntu system. If you are on OS X see notes below. -1. Ensure MySQL and Redis are installed & running on your system. + +- Ensure MySQL 5.7 and Redis are installed & running on your system. 1. Create a MySQL database and user for unglueit. 1. `sudo apt-get upgrade gcc` -1. `sudo apt-get install python-setuptools git python-lxml build-essential libssl-dev libffi-dev python2.7-dev libxml2-dev libxslt-dev libmysqlclient-dev` +1. `sudo apt-get install python-setuptools git python-lxml build-essential libssl-dev libffi-dev python3.8-dev libxml2-dev libxslt-dev libmysqlclient-dev` 1. `sudo easy_install virtualenv virtualenvwrapper` 1. `git clone git@github.com:Gluejar/regluit.git` 1. `cd regluit` 1. `mkvirtualenv regluit` -1. `pip install -r requirements_versioned.pip` +1. `pip install -r requirements.txt` 1. 
`add2virtualenv ..` 1. `cp settings/dev.py settings/me.py` 1. `mkdir settings/keys/` @@ -63,8 +38,9 @@ to install python-setuptools in step 1: 1. `deactivate ; workon regluit` 1. `django-admin.py migrate --noinput` 1. `django-admin.py loaddata core/fixtures/initial_data.json core/fixtures/bookloader.json` populate database with test data to run properly. -1. `django-admin.py celeryd --loglevel=INFO` start the celery daemon to perform asynchronous tasks like adding related editions, and display logging information in the foreground. -1. `django-admin.py celerybeat -l INFO` to start the celerybeat daemon to handle scheduled tasks. +1. `redis-server` to start the task broker +1. `celery -A regluit worker --loglevel=INFO ` start the celery daemon to perform asynchronous tasks like adding related editions, and display logging information in the foreground. Add ` --logfile=logs/celery.log` if you want the logs to go into a log file. +1. `celery -A regluit beat --loglevel=INFO` to start the celerybeat daemon to handle scheduled tasks. 1. `django-admin.py runserver 0.0.0.0:8000` (you can change the port number from the default value of 8000) 1. make sure a [redis server](https://redis.io/topics/quickstart) is running 1. Point your browser to http://localhost:8000/ @@ -77,105 +53,33 @@ CSS development Production Deployment --------------------- -OBSOLETE -Below are the steps for getting regluit running on EC2 with Apache and mod_wsgi, and talking to an Amazon Relational Data Store instance. -Instructions for setting please are slightly different. - -1. create an ubuntu ec2 instance (e.g, go http://alestic.com/ to find various ubuntu images) -1. `sudo aptitude update` -1. `sudo aptitude upgrade` -1. `sudo aptitude install git-core apache libapache2-mod-wsgi mysql-client python-virtualenv python-mysqldb redis-server python-lxml postfix python-dev libmysqlclient-dev` -1. `sudo mkdir /opt/regluit` -1. `sudo chown ubuntu:ubuntu /opt/regluit` -1. `cd /opt` -1. 
`git config --global user.name "Raymond Yee"` -1. `git config --global user.email "rdhyee@gluejar.com"` -1. `ssh-keygen` -1. add `~/.ssh/id\_rsa.pub` as a deploy key on github https://github.com/Gluejar/regluit/admin/keys -1. `git clone git@github.com:Gluejar/regluit.git` -1. `cd /opt/regluit` -1. create an Amazon RDS instance -1. connect to it, e.g. `mysql -u root -h gluejardb.cboagmr25pjs.us-east-1.rds.amazonaws.com -p` -1. `CREATE DATABASE unglueit CHARSET utf8;` -1. `GRANT ALL ON unglueit.\* TO ‘unglueit’@’ip-10-244-250-168.ec2.internal’ IDENTIFIED BY 'unglueit' REQUIRE SSL;` -1. update settings/prod.py with database credentials -1. `virtualenv ENV` -1. `source ENV/bin/activate` -1. `pip install -r requirements_versioned.pip` -1. `echo "/opt/" > ENV/lib/python2.7/site-packages/regluit.pth` -1. `django-admin.py syncdb --migrate --settings regluit.settings.prod` -1. `sudo mkdir /var/www/static` -1. `sudo chown ubuntu:ubuntu /var/www/static` -1. `django-admin.py collectstatic --settings regluit.settings.prod` -1. `sudo ln -s /opt/regluit/deploy/regluit.conf /etc/apache2/sites-available/regluit` -1. `sudo a2ensite regluit` -1. `sudo a2enmod ssl rewrite` -1. `cd /home/ubuntu` -1. copy SSL server key to `/etc/ssl/private/server.key` -1. copy SSL certificate to `/etc/ssl/certs/server.crt` -1. `sudo /etc/init.d/apache2 restart` -1. `sudo adduser --no-create-home celery --disabled-password --disabled-login` (just enter return for all?) -1. `sudo cp deploy/celeryd /etc/init.d/celeryd` -1. `sudo chmod 755 /etc/init.d/celeryd` -1. `sudo cp deploy/celeryd.conf /etc/default/celeryd` -1. `sudo mkdir /var/log/celery` -1. `sudo mkdir /var/run/celery` -1. `sudo chown celery:celery /var/log/celery /var/run/celery` -1. `sudo /etc/init.d/celeryd start` -1. `sudo cp deploy/celerybeat /etc/init.d/celerybeat` -1. `sudo chmod 755 /etc/init.d/celerybeat` -1. `sudo cp deploy/celerybeat.conf /etc/default/celerybeat` -1. `sudo mkdir /var/log/celerybeat` -1. 
`sudo chown celery:celery /var/log/celerybeat` -1. `sudo /etc/init.d/celerybeat start` - -## setup to enable ckeditor to work properly - -1. `mkdir /var/www/static/media/` -1. `sudo chown ubuntu:www-data /var/www/static/media/` - - -Updating Production --------------------- - -1. Study the latest changes in the master branch, especially keep in mind how -it has [changed from what's in production](https://github.com/Gluejar/regluit/compare/production...master). -1. Update the production branch accordingly. If everything in `master` is ready to be moved into `production`, you can just merge `master` into `production`. Otherwise, you can grab specific parts. (How to do so is something that should probably be described in greater detail.) -1. Login to unglue.it and run [`/opt/regluit/deploy/update-prod`](https://github.com/Gluejar/regluit/blob/master/deploy/update-prod) - +See http://github.com/EbookFoundation/regluit-provisioning OS X Developer Notes ------------------- To run regluit on OS X you should have XCode installed -Install virtualenvwrapper according -to the process at http://blog.praveengollakota.com/47430655: +Install MySQL: + `brew install mysql@5.7` + `mysql_secure_installation` + `mysqld_safe --user=root -p` + -1. `sudo easy\_install pip` -1. `sudo pip install virtualenv` -1. `pip install virtualenvwrapper` +We use pyenv and pipenv to set up an environment. Edit or create .bashrc in ~ to enable virtualenvwrapper commands: -1. `mkdir ~/.virtualenvs` -1. Edit .bashrc to include the following lines: - - export WORKON_HOME=$HOME/.virtualenvs - source your_path_to_virtualenvwrapper.sh_here -In the above web site, the path to virtualenvwrapper.sh was -/Library/Frameworks/Python.framework/Versions/2.7/bin/virtualenvwrapper.sh -In Snow Leopard, this may be /usr/local/bin/virtualenvwrapper.sh +1. `pipenv install -r requirements.txt` +1. 
Edit .zshrc to include the following lines: -Configure Terminal to automatically notice this at startup: -Terminal –> Preferences –> Settings –> Shell -Click "run command"; add `source ~/.bashrc` + `eval "$(pyenv init -)"` + `export PATH=$PATH:/Applications/Postgres.app/Contents/Versions/10/bin` + `export PATH=$PATH:/usr/local/opt/mysql-client/bin:$PATH` + `export ANSIBLE_VAULT_PASSWORD_FILE=PATH_TO_VAULT_PASSWORD` -If you get 'EnvironmentError: mysql_config not found' -edit the line ~/.virtualenvs/regluit/build/MySQL-python/setup_posix.py -1. mysql_config.path = "mysql_config" -to be (using a path that exists on your system) -1. mysql_config.path = "/usr/local/mysql-5.5.20-osx10.6-x86_64/bin/mysql_config" +If you get `EnvironmentError: mysql_config not found` +you might need to set a path to mysql_config You may need to set utf8 in /etc/my.cnf collation-server = utf8_unicode_ci @@ -183,15 +87,6 @@ collation-server = utf8_unicode_ci init-connect='SET NAMES utf8' character-set-server = utf8 -Selenium Install ---------------- - -Download the selenium server: -http://selenium.googlecode.com/files/selenium-server-standalone-2.5.0.jar - -Start the selenium server: -'java -jar selenium-server-standalone-2.5.0.jar' - MARC Records ------------ @@ -232,7 +127,12 @@ MARC Records * if you have records with both DIRECT and UNGLUE links, you'll need two MARCRecord instances * if you have both kinds of link, put them in _separate_ records, as marc_format can only take one value +MySQL Migration +--------------- + +## 5.7 - 8.0 Notes -# vagrant / ansible +* Many migration blockers were removed by dumping, then restoring the database. 
+* After that, RDS was able to migrate +* needed to create the unglueit user from the mysql client -[How to build machines using Vagrant/ansible](docs/vagrant_ansible.md) diff --git a/Vagrantfile b/Vagrantfile deleted file mode 100644 index 0ad9cdbc6..000000000 --- a/Vagrantfile +++ /dev/null @@ -1,56 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : - -# All Vagrant configuration is done below. The "2" in Vagrant.configure -# configures the configuration version (we support older styles for -# backwards compatibility). Please don't change it unless you know what -# you're doing. -Vagrant.configure("2") do |config| - # The most common configuration options are documented and commented below. - # For a complete reference, please see the online documentation at - # https://docs.vagrantup.com. - # Every Vagrant development environment requires a box. You can search for - # boxes at https://vagrantcloud.com/search. - config.vm.box = "ubuntu/xenial64" - - # Disable automatic box update checking. If you disable this, then - # boxes will only be checked for updates when the user runs - # `vagrant box outdated`. This is not recommended. 
- config.vm.box_check_update = false - - # Setup specific for local machine - config.vm.define "regluit-local", primary: true do |local| - # Create a private network - local.vm.network "private_network", type: "dhcp" - local.vm.hostname = "regluit-local" - - # VirtuaLBox provider settings for running locally with Oracle VirtualBox - # --uartmode1 disconnected is necessary to disable serial interface, which - # is known to cause issues with Ubuntu 16 VM's - local.vm.provider "virtualbox" do |vb| - vb.name = "regluit-local" - vb.memory = 1024 - vb.cpus = 2 - vb.customize [ "modifyvm", :id, "--uartmode1", "disconnected" ] - end - - end - - config.vm.synced_folder ".", "/vagrant", disabled: true - config.vm.synced_folder ".", "/opt/regluit" - - config.vm.network "forwarded_port", guest: 8000, host: 8000 - - # Provision node with Ansible running on the Vagrant host - # This requires you have Ansible installed locally - # Vagrant autogenerates an ansible inventory file to use - config.vm.provision "ansible_local" do |ansible| - ansible.playbook = "/opt/regluit/provisioning/setup-regluit.yml" - ansible.provisioning_path = "/opt/regluit" - ansible.verbose = true - ansible.install = true - end - - config.vm.post_up_message = "Successfully created regluit-local VM. Run 'vagrant ssh' to log in and start the development server." - -end diff --git a/__init__.py b/__init__.py index e69de29bb..09eec0846 100755 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1,8 @@ +from __future__ import absolute_import, unicode_literals + +# This will make sure the app is always imported when +# Django starts so that shared_task will use this app. 
+from .celery_module import app as celery_app + +__all__ = ('celery_app',) + diff --git a/api/onix.py b/api/onix.py index c2c3eed49..afa7308e0 100644 --- a/api/onix.py +++ b/api/onix.py @@ -1,81 +1,125 @@ import datetime -import pytz import re -from lxml import etree + +from bs4 import BeautifulSoup +import pytz + +from django.core.paginator import Paginator, InvalidPage + +from regluit.bisac import Bisac from regluit.core import models from regluit.core.cc import ccinfo -from regluit.bisac import Bisac from .crosswalks import relator_contrib, iso639 -feed_xml = """ - + +WORKS_PER_PAGE = 30 + +feed_header = """ + """ +feed_xml = feed_header + ''' +''' +soup = None bisac = Bisac() -def text_node(tag, text, attrib={}): - node = etree.Element(tag, attrib=attrib) - node.text = text +def text_node(tag, text, attrib=None): + node = soup.new_tag(tag) + if attrib: + node.attrs = attrib + node.string = text return node -def onix_feed(facet, max=None): - feed = etree.fromstring(feed_xml) - feed.append(header(facet)) +def sub_element(node, tag, attrib=None): + sub = soup.new_tag(tag) + if attrib: + sub.attrs = attrib + node.append(sub) + return sub + + +def onix_feed(facet, max=None, page_number=None): + global soup + if not soup: + soup = BeautifulSoup('', 'lxml') + + yield feed_header + str(header(facet)) works = facet.works[0:max] if max else facet.works + + if page_number is not None: + try: + p = Paginator(works, WORKS_PER_PAGE) + works = p.page(page_number) + except InvalidPage: + works = models.Work.objects.none() + for work in works: - editions = models.Edition.objects.filter(work=work,ebooks__isnull=False) - editions = facet.facet_object.filter_model("Edition",editions).distinct() + editions = models.Edition.objects.filter(work=work, ebooks__isnull=False) + editions = facet.facet_object.filter_model("Edition", editions).distinct() for edition in editions: edition_prod = product(edition, facet.facet_object) if edition_prod is not None: - feed.append(edition_prod) - 
return etree.tostring(feed, pretty_print=True) - + yield edition_prod + yield '' + def onix_feed_for_work(work): - feed = etree.fromstring(feed_xml) - feed.append(header(work)) - for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct(): + global soup + if not soup: + soup = BeautifulSoup('', 'lxml') + + feed = BeautifulSoup(feed_xml, 'xml') + feed.ONIXMessage.append(header(work)) + for edition in models.Edition.objects.filter(work=work, ebooks__isnull=False).distinct(): edition_prod = product(edition) if edition_prod is not None: - feed.append(product(edition)) - return etree.tostring(feed, pretty_print=True) - + feed.ONIXMessage.append(product(edition)) + return str(feed) + def header(facet=None): - header_node = etree.Element("Header") - sender_node = etree.Element("Sender") + header_node = soup.new_tag("Header") + sender_node = soup.new_tag("Sender") sender_node.append(text_node("SenderName", "unglue.it")) sender_node.append(text_node("EmailAddress", "unglueit@ebookfoundation.org")) header_node.append(sender_node) - header_node.append(text_node("SentDateTime", pytz.utc.localize(datetime.datetime.utcnow()).strftime('%Y%m%dT%H%M%SZ'))) + header_node.append(text_node( + "SentDateTime", + pytz.utc.localize(datetime.datetime.utcnow()).strftime('%Y%m%dT%H%M%SZ') + )) header_node.append(text_node("MessageNote", facet.title if facet else "Unglue.it Editions")) return header_node def product(edition, facet=None): - ebooks=facet.filter_model("Ebook",edition.ebooks.filter(active=True)) if facet else edition.ebooks.filter(active=True) - ebooks=ebooks.order_by('-created') - # Just because an edition satisfies 2 facets with multiple ebooks doesn't mean that there is a single ebook satisfies both facets + ebooks = facet.filter_model( + "Ebook", + edition.ebooks.filter(active=True) + ) if facet else edition.ebooks.filter(active=True) + ebooks = ebooks.order_by('-created') + # Just because an edition satisfies 2 facets with multiple ebooks doesn't 
mean that there + # is a single ebook satisfies both facets if not ebooks.exists(): return None - - work=edition.work - product_node = etree.Element("Product") - product_node.append(text_node("RecordReference", "it.unglue.work.%s.%s" % (work.id, edition.id))) - product_node.append(text_node("NotificationType", "03" )) # final - - ident_node = etree.SubElement(product_node, "ProductIdentifier") - ident_node.append(text_node("ProductIDType", "01" )) #proprietary - ident_node.append(text_node("IDTypeName", "unglue.it edition id" )) #proprietary - ident_node.append(text_node("IDValue", unicode(edition.id) )) - + + work = edition.work + product_node = soup.new_tag("Product") + product_node.append(text_node( + "RecordReference", "it.unglue.work.%s.%s" % (work.id, edition.id) + )) + product_node.append(text_node("NotificationType", "03")) # final + + ident_node = sub_element(product_node, "ProductIdentifier") + ident_node.append(text_node("ProductIDType", "01")) #proprietary + ident_node.append(text_node("IDTypeName", "unglue.it edition id")) #proprietary + ident_node.append(text_node("IDValue", str(edition.id))) + # wrong isbn better than no isbn isbn = edition.isbn_13 if edition.isbn_13 else edition.work.first_isbn_13() if isbn: - ident_node = etree.SubElement(product_node, "ProductIdentifier") - ident_node.append(text_node("ProductIDType", "03" )) #proprietary - ident_node.append(text_node("IDValue", isbn )) + ident_node = sub_element(product_node, "ProductIdentifier") + ident_node.append(text_node("ProductIDType", "03")) #proprietary + ident_node.append(text_node("IDValue", isbn)) # Descriptive Detail Block - descriptive_node = etree.SubElement(product_node, "DescriptiveDetail") - descriptive_node.append(text_node("ProductComposition", "00" )) # single item - descriptive_node.append(text_node("ProductForm", "ED" )) # download + descriptive_node = sub_element(product_node, "DescriptiveDetail") + descriptive_node.append(text_node("ProductComposition", "00")) # single 
item + descriptive_node.append(text_node("ProductForm", "ED")) # download ebook = None latest_ebooks = [] @@ -84,129 +128,131 @@ def product(edition, facet=None): if ebook.format not in ebook_formats: ebook_formats.append(ebook.format) latest_ebooks.append(ebook) - if ebook.format=='epub': - descriptive_node.append(text_node("ProductFormDetail", "E101" )) - elif ebook.format=='pdf': - descriptive_node.append(text_node("ProductFormDetail", "E107" )) - elif ebook.format=='mobi': - descriptive_node.append(text_node("ProductFormDetail", "E116" )) + if ebook.format == 'epub': + descriptive_node.append(text_node("ProductFormDetail", "E101")) + elif ebook.format == 'pdf': + descriptive_node.append(text_node("ProductFormDetail", "E107")) + elif ebook.format == 'mobi': + descriptive_node.append(text_node("ProductFormDetail", "E116")) if ebook.rights: - license_node = etree.SubElement(descriptive_node, "EpubLicense") - license_node.append(text_node("EpubLicenseName", ebook.rights )) - lic_expr_node = etree.SubElement(license_node, "EpubLicenseExpression") - lic_expr_node.append(text_node("EpubLicenseExpressionType", '01' )) #human readable - lic_expr_node.append(text_node("EpubLicenseExpressionLink", ccinfo(ebook.rights).url )) - - title_node = etree.SubElement(descriptive_node, "TitleDetail") - title_node.append(text_node("TitleType", '01' )) #distinctive title - title_el = etree.SubElement(title_node, "TitleElement") - title_el.append(text_node("TitleElementLevel", '01' )) - title_el.append(text_node("TitleText", edition.title )) + license_node = sub_element(descriptive_node, "EpubLicense") + license_node.append(text_node("EpubLicenseName", ebook.rights)) + lic_expr_node = sub_element(license_node, "EpubLicenseExpression") + lic_expr_node.append(text_node("EpubLicenseExpressionType", '01')) #human readable + lic_expr_node.append(text_node("EpubLicenseExpressionLink", ccinfo(ebook.rights).url)) + + title_node = sub_element(descriptive_node, "TitleDetail") + 
title_node.append(text_node("TitleType", '01')) #distinctive title + title_el = sub_element(title_node, "TitleElement") + title_el.append(text_node("TitleElementLevel", '01')) + title_el.append(text_node("TitleText", edition.title)) contrib_i = 0 for contrib in edition.relators.all(): - contrib_i+=1 - contrib_node = etree.SubElement(descriptive_node, "Contributor") - contrib_node.append(text_node("SequenceNumber", unicode(contrib_i ))) - contrib_node.append(text_node("ContributorRole", relator_contrib.get(contrib.relation.code,"") )) + contrib_i += 1 + contrib_node = sub_element(descriptive_node, "Contributor") + contrib_node.append(text_node("SequenceNumber", str(contrib_i))) + contrib_node.append(text_node("ContributorRole", + relator_contrib.get(contrib.relation.code, ""))) contrib_node.append(text_node("PersonName", contrib.author.name)) contrib_node.append(text_node("PersonNameInverted", contrib.author.last_name_first)) (lang, locale) = (edition.work.language, None) if '_' in lang: (lang, locale) = lang.split('_') - if len(lang)==2: + if len(lang) == 2: lang = iso639.get(lang, None) if lang: - lang_node = etree.SubElement(descriptive_node, "Language") + lang_node = sub_element(descriptive_node, "Language") lang_node.append(text_node("LanguageRole", "01")) lang_node.append(text_node("LanguageCode", lang)) if locale: lang_node.append(text_node("CountryCode", locale)) for subject in work.subjects.all(): - subj_node = etree.SubElement(descriptive_node, "Subject") + subj_node = sub_element(descriptive_node, "Subject") if subject.authority == 'lcsh': subj_node.append(text_node("SubjectSchemeIdentifier", "04")) - subj_node.append(text_node("SubjectHeadingText", subject.name)) + subj_node.append(text_node("SubjectHeadingText", subject.name)) elif subject.authority == 'lcc': subj_node.append(text_node("SubjectSchemeIdentifier", "03")) subj_node.append(text_node("SubjectCode", subject.name)) - elif subject.authority == 'bisacsh': + elif subject.authority == 'bisacsh': 
subj_node.append(text_node("SubjectSchemeIdentifier", "10")) subj_node.append(text_node("SubjectCode", bisac.code(subject.name))) - subj_node.append(text_node("SubjectHeadingText", subject.name)) + subj_node.append(text_node("SubjectHeadingText", subject.name)) else: subj_node.append(text_node("SubjectSchemeIdentifier", "20")) - subj_node.append(text_node("SubjectHeadingText", subject.name)) + subj_node.append(text_node("SubjectHeadingText", subject.name)) # audience range composite if work.age_level: range_match = re.search(r'(\d?\d?)-(\d?\d?)', work.age_level) if range_match: - audience_range_node = etree.SubElement(descriptive_node, "AudienceRange") - audience_range_node.append(text_node("AudienceRangeQualifier", "17")) #Interest age, years + audience_range_node = sub_element(descriptive_node, "AudienceRange") + #Interest age, years + audience_range_node.append(text_node("AudienceRangeQualifier", "17")) if range_match.group(1): audience_range_node.append(text_node("AudienceRangePrecision", "03")) #from - audience_range_node.append(text_node("AudienceRangeValue", range_match.group(1))) + audience_range_node.append(text_node("AudienceRangeValue", range_match.group(1))) if range_match.group(2): audience_range_node.append(text_node("AudienceRangePrecision", "04")) #from - audience_range_node.append(text_node("AudienceRangeValue", range_match.group(2))) - + audience_range_node.append(text_node("AudienceRangeValue", range_match.group(2))) + # Collateral Detail Block - coll_node = etree.SubElement(product_node, "CollateralDetail") - desc_node = etree.SubElement(coll_node, "TextContent") + coll_node = sub_element(product_node, "CollateralDetail") + desc_node = sub_element(coll_node, "TextContent") desc_node.append(text_node("TextType", '03')) # description desc_node.append(text_node("ContentAudience", '00')) #unrestricted - desc = (work.description if work.description else '') + '

Listed by Unglue.it.' % work.id - try : - content = etree.XML("
" + desc + "
") - content_node = etree.SubElement(desc_node, "Text", attrib={"textformat":"05"}) #xhtml - content_node.append(content) - except etree.XMLSyntaxError: - content_node = etree.SubElement(desc_node, "Text", attrib={"textformat":"02"}) #html - content_node.text = etree.CDATA(desc) - supp_node = etree.SubElement(coll_node, "SupportingResource") + desc = (work.description if work.description else '') + \ + '

Listed by Unglue.it.' % work.id + content = BeautifulSoup('
' + desc + '
', 'lxml') + content_node = sub_element(desc_node, "Text", attrib={"textformat":"05"}) #xhtml + content_node.append(content.body.div) + supp_node = sub_element(coll_node, "SupportingResource") supp_node.append(text_node("ResourceContentType", '01')) #front cover supp_node.append(text_node("ContentAudience", '00')) #unrestricted supp_node.append(text_node("ResourceMode", '03')) #image - cover_node = etree.SubElement(supp_node, "ResourceVersion") + cover_node = sub_element(supp_node, "ResourceVersion") cover_node.append(text_node("ResourceForm", '01')) #linkable - coverfeat_node = etree.SubElement(cover_node, "ResourceVersionFeature") + coverfeat_node = sub_element(cover_node, "ResourceVersionFeature") coverfeat_node.append(text_node("ResourceVersionFeatureType", '01')) #image format coverfeat_node.append(text_node("FeatureValue", 'D502')) #jpeg cover_node.append(text_node("ResourceLink", edition.cover_image_thumbnail())) #link # Publishing Detail Block - pubdetail_node = etree.SubElement(product_node, "PublishingDetail") + pubdetail_node = sub_element(product_node, "PublishingDetail") if edition.publisher_name: - pub_node = etree.SubElement(pubdetail_node, "Publisher") + pub_node = sub_element(pubdetail_node, "Publisher") pub_node.append(text_node("PublishingRole", '01')) #publisher pub_node.append(text_node("PublisherName", edition.publisher_name.name)) pubdetail_node.append(text_node("PublishingStatus", '00')) #unspecified - + #consumers really want a pub date - publication_date = edition.publication_date if edition.publication_date else edition.work.earliest_publication_date + publication_date = edition.publication_date if edition.publication_date else \ + edition.work.earliest_publication_date if publication_date: - pubdate_node = etree.SubElement(pubdetail_node, "PublishingDate") + pubdate_node = sub_element(pubdetail_node, "PublishingDate") pubdate_node.append(text_node("PublishingDateRole", '01')) #nominal pub date - pubdate_node.append(text_node("Date", 
publication_date.replace('-',''))) - + pubdate_node.append(text_node("Date", publication_date.replace('-', ''))) + # Product Supply Block - supply_node = etree.SubElement(product_node,"ProductSupply") - market_node = etree.SubElement(supply_node,"Market") - terr_node = etree.SubElement(market_node,"Territory") + supply_node = sub_element(product_node, "ProductSupply") + market_node = sub_element(supply_node, "Market") + terr_node = sub_element(market_node, "Territory") terr_node.append(text_node("RegionsIncluded", 'WORLD')) - supply_detail_node = etree.SubElement(supply_node,"SupplyDetail") - supplier_node = etree.SubElement(supply_detail_node,"Supplier") + supply_detail_node = sub_element(supply_node, "SupplyDetail") + supplier_node = sub_element(supply_detail_node, "Supplier") supplier_node.append(text_node("SupplierRole", '11')) #non-exclusive distributer supplier_node.append(text_node("SupplierName", 'Unglue.it')) #non-exclusive distributer for ebook in latest_ebooks: - website_node = etree.SubElement(supplier_node,"Website") + website_node = sub_element(supplier_node, "Website") website_node.append(text_node("WebsiteRole", '29')) #full content - website_node.append(text_node("WebsiteDescription", '%s file download' % ebook.format, attrib={'textformat':'06'})) #full content + #full content + website_node.append(text_node("WebsiteDescription", + '%s file download' % ebook.format, + attrib={'textformat':'06'})) website_node.append(text_node("WebsiteLink", ebook.download_url)) #full content supply_detail_node.append(text_node("ProductAvailability", '20')) #Available - price_node = etree.SubElement(supply_detail_node,"Price") + price_node = sub_element(supply_detail_node, "Price") price_node.append(text_node("PriceType", '01')) #retail excluding tax price_node.append(text_node("PriceAmount", '0.00')) #retail excluding tax price_node.append(text_node("CurrencyCode", 'USD')) #retail excluding tax return product_node - \ No newline at end of file diff --git 
a/api/opds.py b/api/opds.py index 907f0efd5..7b514b561 100644 --- a/api/opds.py +++ b/api/opds.py @@ -1,32 +1,36 @@ +import datetime from itertools import islice +import logging +from urllib.parse import urlparse, urlunparse -from lxml import etree -import datetime -import urlparse -from django.core.urlresolvers import reverse +from bs4 import BeautifulSoup +import pytz + +from django.core.cache import cache +from django.urls import reverse from django.utils.http import urlquote -import pytz -import logging -logger = logging.getLogger(__name__) from regluit.core import models, facets import regluit.core.cc as cc licenses = cc.LICENSE_LIST +logger = logging.getLogger(__name__) +soup = None FORMAT_TO_MIMETYPE = {'pdf':"application/pdf", 'epub':"application/epub+zip", 'mobi':"application/x-mobipocket-ebook", 'html':"text/html", 'text':"text/html"} -UNGLUEIT_URL= 'https://unglue.it' -ACQUISITION = "application/atom+xml;profile=opds-catalog;kind=acquisition" +UNGLUEIT_URL = 'https://unglue.it' +ACQUISITION = "application/atom+xml; profile=opds-catalog ;kind=acquisition; charset=utf-8" +NAVIGATION = "application/atom+xml; profile=opds-catalog; kind=navigation; charset=utf-8" FACET_RELATION = "http://opds-spec.org/facet" -old_facets= ["creative_commons","active_campaigns"] +old_facets = ["creative_commons", "active_campaigns"] def feeds(): @@ -40,50 +44,56 @@ def feeds(): def get_facet_class(name): if name in old_facets: return globals()[name] - else: - return get_facet_facet(name) - - + return get_facet_facet(name) + + def text_node(tag, text): - node = etree.Element(tag) - node.text = text + node = soup.new_tag(tag) + if text: + node.string = text return node def html_node(tag, html): node = text_node(tag, html) - node.attrib.update({"{http://www.w3.org/2005/Atom}type":'html'}) + node.attrs.update({"type":'html'}) return node - + def add_query_component(url, qc): """ add component qc to the querystring of url """ - m = list(urlparse.urlparse(url)) - if len(m[4]): - m[4] 
= "&".join([m[4],qc]) + m = list(urlparse(url)) + if m[4]: + m[4] = "&".join([m[4], qc]) else: m[4] = qc - return urlparse.urlunparse(m) + return urlunparse(m) def isbn_node(isbn): - node = etree.Element("{http://purl.org/dc/terms/}identifier") - node.attrib.update({"{http://www.w3.org/2001/XMLSchema-instance}type":'dcterms:URI'}) - node.text = 'urn:ISBN:'+ isbn + node = soup.new_tag("dcterms:identifier") + node.attrs.update({"xsi:type":'dcterms:URI'}) + node.string = 'urn:ISBN:'+ isbn return node def work_node(work, facet=None): - - node = etree.Element("entry") + + node = soup.new_tag("entry") # title node.append(text_node("title", work.title)) - + # id - node.append(text_node('id', "{base}{url}".format(base=UNGLUEIT_URL,url=reverse('work_identifier',kwargs={'work_id':work.id})))) - + node.append(text_node( + 'id', + "{base}{url}".format( + base=UNGLUEIT_URL, + url=reverse('work_identifier', kwargs={'work_id': work.id}) + ) + )) + updated = None - + # links for all ebooks - ebooks = facet.filter_model("Ebook",work.ebooks()) if facet else work.ebooks() + ebooks = facet.filter_model("Ebook", work.ebooks()) if facet else work.ebooks() versions = set() for ebook in ebooks: if updated is None: @@ -92,78 +102,85 @@ def work_node(work, facet=None): node.append(text_node('updated', updated)) if not ebook.version_label in versions: versions.add(ebook.version_label) - link_node = etree.Element("link") - + link_node = soup.new_tag("link") + # ebook.download_url is an absolute URL with the protocol, domain, and path baked in - link_rel = "http://opds-spec.org/acquisition/open-access" - link_node.attrib.update({"href":add_query_component(ebook.download_url, "feed=opds"), - "rel":link_rel, - "{http://purl.org/dc/terms/}rights": str(ebook.rights)}) - if ebook.is_direct(): - link_node.attrib["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "") + link_rel = "http://opds-spec.org/acquisition/open-access" + link_node.attrs.update({ + "href":add_query_component(ebook.download_url, 
"feed=opds"), + "rel":link_rel, + "dcterms:rights": str(ebook.rights) + }) + if ebook.is_direct(): + link_node["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "") else: - """ indirect acquisition, i.e. google books """ - link_node.attrib["type"] = "text/html" - indirect = etree.Element("{http://opds-spec.org/}indirectAcquisition",) - indirect.attrib["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "") + # indirect acquisition, i.e. google books + link_node["type"] = "text/html" + indirect = soup.new_tag("opds:indirectAcquisition",) + indirect["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "") link_node.append(indirect) if ebook.version_label: - link_node.attrib.update({"{http://schema.org/}version": ebook.version_label}) + link_node.attrs.update({"schema:version": ebook.version_label}) node.append(link_node) - + # get the cover -- assume jpg? - - cover_node = etree.Element("link") - cover_node.attrib.update({"href":work.cover_image_small(), - "type":"image/"+work.cover_filetype(), - "rel":"http://opds-spec.org/image/thumbnail"}) + + cover_node = soup.new_tag("link") + cover_node.attrs.update({ + "href": work.cover_image_small(), + "type": "image/" + work.cover_filetype(), + "rel": "http://opds-spec.org/image/thumbnail" + }) node.append(cover_node) - cover_node = etree.Element("link") - cover_node.attrib.update({"href":work.cover_image_thumbnail(), - "type":"image/"+work.cover_filetype(), - "rel":"http://opds-spec.org/image"}) + cover_node = soup.new_tag("link") + cover_node.attrs.update({ + "href": work.cover_image_thumbnail(), + "type": "image/" + work.cover_filetype(), + "rel": "http://opds-spec.org/image" + }) node.append(cover_node) - - + + # 2012 - node.append(text_node("{http://purl.org/dc/terms/}issued", work.publication_date)) - + node.append(text_node("dcterms:issued", work.publication_date)) + # author # TO DO: include all authors? 
- author_node = etree.Element("author") + author_node = soup.new_tag("author") author_node.append(text_node("name", work.author())) node.append(author_node) - + # publisher #Open Book Publishers - if len(work.publishers()): + if work.publishers().exists(): for publisher in work.publishers(): - node.append(text_node("{http://purl.org/dc/terms/}publisher", publisher.name.name)) - + node.append(text_node("dcterms:publisher", publisher.name.name)) + # language #en - node.append(text_node("{http://purl.org/dc/terms/}language", work.language)) - + node.append(text_node("dcterms:language", work.language)) + # description - node.append(html_node("{http://www.w3.org/2005/Atom}content", work.description)) - + node.append(html_node("content", work.description)) + # identifiers if work.identifiers.filter(type='isbn'): for isbn in work.identifiers.filter(type='isbn')[0:9]: #10 should be more than enough node.append(isbn_node(isbn.value)) - + # subject tags # [[subject.name for subject in work.subjects.all()] for work in ccworks if work.subjects.all()] for subject in work.subjects.all(): if subject.is_visible: - category_node = etree.Element("category") + category_node = soup.new_tag("category") try: - category_node.attrib["term"] = subject.name + category_node["term"] = subject.name node.append(category_node) try: subject.works.filter(is_free=True)[1] # only show feed if there's another work in it - append_navlink(node, 'related', 'kw.'+ subject.name , 0, 'popular', title=subject.name) + node.append(navlink('related', 'kw.' 
+ subject.name, 0, + 'popular', title=subject.name)) except: pass except ValueError: @@ -172,48 +189,53 @@ def work_node(work, facet=None): subject.delete() # age level - # + # if work.age_level: - category_node = etree.Element("category") - category_node.attrib["scheme"] = 'http://schema.org/typicalAgeRange' - category_node.attrib["term"] = work.age_level - category_node.attrib["label"] = work.get_age_level_display() + category_node = soup.new_tag("category") + category_node["scheme"] = 'http://schema.org/typicalAgeRange' + category_node["term"] = work.age_level + category_node["label"] = work.get_age_level_display() node.append(category_node) - - - # rating - rating_node = etree.Element("{http://schema.org/}Rating") - rating_node.attrib.update({"{http://schema.org/}ratingValue":"{:}".format(work.priority())}) + + + # rating + rating_node = soup.new_tag("schema:Rating") + rating_node.attrs.update({"schema:ratingValue":"{:}".format(work.priority())}) node.append(rating_node) return node class Facet: title = '' - works = None + works = models.Work.objects.none() feed_path = '' description = '' - + def feed(self, page=None, order_by='newest'): self.works = self.works.order_by(*facets.get_order_by(order_by)) return opds_feed_for_works(self, page=page, order_by=order_by) - + def updated(self): # return the creation date for most recently added item - if not self.works: + key = f"{self.feed_path.replace(' ', '_')}_updated" + if not self.works.exists(): return pytz.utc.localize(datetime.datetime.utcnow()).isoformat() - else: - return pytz.utc.localize(self.works[0].created).isoformat() + value = cache.get(key) + if value is None: + value = pytz.utc.localize(self.works.latest('created').created).isoformat() + cache.set(key, value, 100000) + return value def get_facet_facet(facet_path): class Facet_Facet(Facet): - + def __init__(self, facet_path=facet_path): self.feed_path = facet_path self.facet_object = facets.get_facet_object(facet_path) self.title = "Unglue.it" for 
facet in self.facet_object.facets(): self.title = self.title + " " + facet.title - self.works = self.facet_object.get_query_set().distinct() + self.works = self.facet_object.get_query_set() self.description = self.facet_object.description return Facet_Facet @@ -221,11 +243,14 @@ class creative_commons(Facet): def __init__(self): self.title = "Unglue.it Catalog: Creative Commons Books" self.feed_path = "creative_commons" - self.works = models.Work.objects.filter(editions__ebooks__isnull=False, - editions__ebooks__rights__in=cc.LICENSE_LIST).distinct() - self.description= "These Creative Commons licensed ebooks are free to read - the people who created them want you to read and share them." + self.works = models.Work.objects.filter( + editions__ebooks__isnull=False, + editions__ebooks__rights__in=cc.LICENSE_LIST + ) + self.description = """These Creative Commons licensed ebooks are free to read - the people + who created them want you to read and share them.""" self.facet_object = facets.get_facet_object(self.feed_path) - + class active_campaigns(Facet): """ return opds feed for works associated with active campaigns @@ -233,115 +258,136 @@ class active_campaigns(Facet): def __init__(self): self.title = "Unglue.it Catalog: Books under Active Campaign" self.feed_path = "active_campaigns" - self.works = models.Work.objects.filter(campaigns__status='ACTIVE', is_free = True) - self.description= "With your help we're raising money to make these books free to the world." 
+ self.works = models.Work.objects.filter(campaigns__status='ACTIVE', is_free=True) + self.description = """With your help we're raising money + to make these books free to the world.""" self.facet_object = facets.get_facet_object(self.feed_path) def opds_feed_for_work(work_id): class single_work_facet: def __init__(self, work_id): try: - works=models.Work.objects.filter(id=work_id) + works = models.Work.objects.filter(id=work_id) except models.Work.DoesNotExist: - works=models.Work.objects.none() + works = models.Work.objects.none() except ValueError: # not a valid work_id - works=models.Work.objects.none() - self.works=works - self.title='Unglue.it work #%s' % work_id - self.feed_path='' - self.facet_object= facets.BaseFacet(None) - return opds_feed_for_works( single_work_facet(work_id) ) + works = models.Work.objects.none() + self.works = works + self.title = 'Unglue.it work #%s' % work_id + self.feed_path = '' + self.facet_object = facets.BaseFacet(None) + return opds_feed_for_works(single_work_facet(work_id)) def opds_feed_for_works(the_facet, page=None, order_by='newest'): - works = the_facet.works + global soup + if not soup: + soup = BeautifulSoup('', 'lxml') + works = the_facet.works.distinct() feed_path = the_facet.feed_path title = the_facet.title - feed_xml = """ + """ - - feed = etree.fromstring(feed_xml) - + xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd + http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd"> + """ + + yield feed_header + # add title # TO DO: will need to calculate the number items and where in the feed we are - - feed.append(text_node('title', title + ' - sorted by ' + order_by)) - - # id - - feed.append(text_node('id', "{url}/api/opds/{feed_path}/?order_by={order_by}".format(url=UNGLUEIT_URL, - feed_path=urlquote(feed_path), order_by=order_by))) - + + yield text_node('title', title + ' - sorted by ' + order_by).prettify() + + # id + + 
feed = text_node( + 'id', + "{url}/api/opds/{feed_path}/?order_by={order_by}".format( + url=UNGLUEIT_URL, + feed_path=urlquote(feed_path), + order_by=order_by, + ), + ) + yield feed.prettify() + # updated # TO DO: fix time zone? # also use our wrapped datetime code - - feed.append(text_node('updated', - pytz.utc.localize(datetime.datetime.utcnow()).isoformat())) - + + feed = text_node('updated', pytz.utc.localize(datetime.datetime.utcnow()).isoformat()) + yield feed.prettify() + # author - - author_node = etree.Element("author") + + author_node = soup.new_tag("author") author_node.append(text_node('name', 'unglue.it')) author_node.append(text_node('uri', UNGLUEIT_URL)) - feed.append(author_node) - + yield author_node.prettify() + # links: start, self, next/prev (depending what's necessary -- to start with put all CC books) - + # start link - append_navlink(feed, 'start', feed_path, None , order_by, title="First 10") - + yield navlink('start', feed_path, None, order_by, title="First 10").prettify() + # next link - + if not page: - page =0 + page = 0 else: try: - page=int(page) + page = int(page) except TypeError: - page=0 - + page = 0 + try: works[10 * page + 10] - append_navlink(feed, 'next', feed_path, page+1 , order_by, title="Next 10") + yield navlink('next', feed_path, page+1, order_by, title="Next 10").prettify() except IndexError: pass - + # sort facets - append_navlink(feed, FACET_RELATION, feed_path, None, 'popular', group="Order", active = order_by=='popular', title="Sorted by popularity") - append_navlink(feed, FACET_RELATION, feed_path, None, 'newest', group="Order", active = order_by=='newest', title="Sorted by newest") - + yield navlink(FACET_RELATION, feed_path, None, 'popular', group="Order", + active=order_by == 'popular', title="Sorted by popularity").prettify() + yield navlink(FACET_RELATION, feed_path, None, 'newest', group="Order", + active=order_by == 'newest', title="Sorted by newest").prettify() + #other facets if feed_path not in old_facets: 
for other_group in the_facet.facet_object.get_other_groups(): for facet_object in other_group.get_facets(): - append_navlink(feed, FACET_RELATION, feed_path + '/' + facet_object.facet_name, None, order_by, group=other_group.title, title=facet_object.title) - - works = islice(works, 10 * page, 10 * page + 10) + yield navlink(FACET_RELATION, feed_path + '/' + facet_object.facet_name, + None, order_by, group=other_group.title, + title=facet_object.title).prettify() + + works = islice(works, 10 * page, 10 * page + 10) if page > 0: - append_navlink(feed, 'previous', feed_path, page-1, order_by, title="Previous 10") + yield navlink('previous', feed_path, page-1, order_by, title="Previous 10").prettify() + for work in works: - node = work_node(work, facet=the_facet.facet_object) - feed.append(node) - - return etree.tostring(feed, pretty_print=True) - -def append_navlink(feed, rel, path, page, order_by, group=None, active=None , title=""): - link = etree.Element("link") - link.attrib.update({"rel":rel, - "href": UNGLUEIT_URL + "/api/opds/" + urlquote(path) + '/?order_by=' + order_by + ('&page=' + unicode(page) if page!=None else ''), - "type": ACQUISITION, - "title": title, - }) + yield work_node(work, facet=the_facet.facet_object).prettify() + + yield ''' +''' + +def navlink(rel, path, page, order_by, group=None, active=None, title=""): + link = soup.new_tag("link") + link.attrs.update({ + "rel":rel, + "href": UNGLUEIT_URL + "/api/opds/" + urlquote(path) + '/?order_by=' + order_by + ( + '&page=' + str(page) if page is not None else '' + ), + "type": ACQUISITION, + "title": title, + }) if rel == FACET_RELATION: if group: - link.attrib['{http://opds-spec.org/}facetGroup'] = group + link['opds:facetGroup'] = group if active: - link.attrib['{http://opds-spec.org/}activeFacet'] = 'true' - feed.append(link) \ No newline at end of file + link['opds:activeFacet'] = 'true' + return link diff --git a/api/opds_json.py b/api/opds_json.py index 59e218029..4007b99ae 100644 --- 
a/api/opds_json.py +++ b/api/opds_json.py @@ -1,26 +1,22 @@ -from itertools import islice - import datetime -import urlparse -from django.core.urlresolvers import reverse -from django.utils.http import urlquote +from itertools import islice +import logging import json + import pytz -import logging -logger = logging.getLogger(__name__) +from django.urls import reverse +from django.utils.http import urlquote from regluit.core import models, facets import regluit.core.cc as cc + from .opds import ( - feeds, - get_facet_class, add_query_component, - Facet, - get_facet_facet, - opds_feed_for_work, ) +logger = logging.getLogger(__name__) + licenses = cc.LICENSE_LIST FORMAT_TO_MIMETYPE = {'pdf':"application/pdf", @@ -29,7 +25,7 @@ 'html':"text/html", 'text':"text/html"} -UNGLUEIT_URL= 'https://unglue.it' +UNGLUEIT_URL = 'https://unglue.it' ACQUISITION = "application/opds+json" FACET_RELATION = "opds:facet" JSONCONTEXT = "http://opds-spec.org/opds.jsonld" @@ -42,24 +38,22 @@ def feeds(): def get_facet_class(name): return get_facet_facet(name) - + def text_node(tag, text): return {tag:text} def html_node(tag, html): return {tag:html} - + def isbn_node(isbn): return 'urn:ISBN:'+ isbn def work_node(work, facet=None): - - - metadata = {"@type": "http://schema.org/EBook", + metadata = { + "@type": "http://schema.org/EBook", "id": "{base}{url}".format( base=UNGLUEIT_URL, - url=reverse('work_identifier', - kwargs={'work_id':work.id}) + url=reverse('work_identifier', kwargs={'work_id':work.id}) ) } links = [] @@ -73,7 +67,7 @@ def work_node(work, facet=None): } # title metadata["title"] = work.title - + # id links.append({ "rel": "self", @@ -84,18 +78,18 @@ def work_node(work, facet=None): ), "type": "application/opds-publication+json" }) - + updated = None - + # links for all ebooks - ebooks = facet.filter_model("Ebook",work.ebooks()) if facet else work.ebooks() + ebooks = facet.filter_model("Ebook", work.ebooks()) if facet else work.ebooks() versions = set() for ebook in ebooks: 
if updated is None: # most recent ebook, first ebook in loop updated = ebook.created.isoformat() - metadata['updated'] = updated + metadata['updated'] = updated if not ebook.version_label in versions: versions.add(ebook.version_label) # ebook.download_url is an absolute URL with the protocol, domain, and path baked in @@ -104,19 +98,19 @@ def work_node(work, facet=None): "href": add_query_component(ebook.download_url, "feed=opds"), "rights": str(ebook.rights) } - if ebook.is_direct(): + if ebook.is_direct(): acquire["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "") else: - """ indirect acquisition, i.e. google books """ + # indirect acquisition, i.e. google books acquire["type"] = "text/html" acquire["indirectAcquisition"] = { "type": FORMAT_TO_MIMETYPE.get(ebook.format) } if ebook.version_label: acquire["version"] = ebook.version_label - + acquires.append(acquire) - + # get the cover -- assume jpg? if work.cover_image_small(): cover_node = { @@ -130,80 +124,85 @@ def work_node(work, facet=None): "type": "image/"+work.cover_filetype(), } images.append(cover_node2) - - + + # 2012 metadata["issued"] = work.publication_date - + # author # TO DO: include all authors? 
- metadata["author"] = work.author() - + metadata["author"] = work.author() + # publisher #Open Book Publishers - if len(work.publishers()): - metadata["publishers"] = [{"publisher": publisher.name.name} - for publisher in work.publishers()] - + if work.publishers().exists(): + metadata["publishers"] = [ + {"publisher": publisher.name.name} for publisher in work.publishers() + ] # language metadata["language"] = work.language - + # description metadata["summary"] = work.description - + # identifiers if work.identifiers.filter(type='isbn'): - metadata['identifiers'] = [isbn_node(isbn.value) - for isbn in work.identifiers.filter(type='isbn')[0:9]] #10 should be more than enough + metadata['identifiers'] = [ + isbn_node(isbn.value) for isbn in work.identifiers.filter(type='isbn')[0:9] + ] # 10 should be more than enough - # subject tags - subjects = [subject.name for subject in work.subjects.all()] + subjects = [subject.name for subject in work.subjects.all()] if subjects: metadata["subjects"] = subjects # age level - # + # if work.age_level: age_level_node_attrib = {} age_level_node = {"category": age_level_node_attrib} - age_level_node_attrib["scheme"] = 'http://schema.org/typicalAgeRange' - age_level_node_attrib["term"] = work.age_level - age_level_node_attrib["label"] = work.get_age_level_display() + age_level_node_attrib["scheme"] = 'http://schema.org/typicalAgeRange' + age_level_node_attrib["term"] = work.age_level + age_level_node_attrib["label"] = work.get_age_level_display() metadata.update(age_level_node) - - - # rating + + + # rating metadata["rating"] = {"ratingValue":"{:}".format(work.priority())} return content class Facet: title = '' - works = None + works = models.Work.objects.none() feed_path = '' description = '' - + def feed(self, page=None, order_by='newest'): self.works = self.works.order_by(*facets.get_order_by(order_by)) return opds_feed_for_works(self, page=page, order_by=order_by) - + def updated(self): # return the creation date for most 
recently added item - if not self.works: + key = f"{self.feed_path.replace(' ', '_')}_updated" + if not self.works.exists(): return pytz.utc.localize(datetime.datetime.utcnow()).isoformat() - else: - return pytz.utc.localize(self.works[0].created).isoformat() + value = cache.get(key) + if value is None: + value = pytz.utc.localize(self.works.latest('created').created).isoformat() + cache.set(key, value, 100000) + return value def get_facet_facet(facet_path): class Facet_Facet(Facet): - + def __init__(self, facet_path=facet_path): self.feed_path = facet_path self.facet_object = facets.get_facet_object(facet_path) self.title = "Unglue.it" for facet in self.facet_object.facets(): self.title = self.title + " " + facet.title - self.works = self.facet_object.get_query_set().distinct() + self.works = self.facet_object.get_query_set() self.description = self.facet_object.description return Facet_Facet @@ -214,38 +213,37 @@ class NullFacet(facets.BaseFacet): def get_other_groups(self): return[] try: - works=models.Work.objects.filter(id=work_id) + works = models.Work.objects.filter(id=work_id) except models.Work.DoesNotExist: - works=models.Work.objects.none() + works = models.Work.objects.none() except ValueError: # not a valid work_id - works=models.Work.objects.none() - self.works=works - self.title='Unglue.it work #%s' % work_id - self.feed_path='' - self.facet_object= NullFacet(None) - return opds_feed_for_works( single_work_facet(work_id) ) + works = models.Work.objects.none() + self.works = works + self.title = 'Unglue.it work #%s' % work_id + self.feed_path = '' + self.facet_object = NullFacet(None) + return opds_feed_for_works(single_work_facet(work_id)) def opds_feed_for_works(the_facet, page=None, order_by='newest'): if order_by == 'none': books_per_page = 50000 + order_by = 'newest' else: books_per_page = 50 - works = the_facet.works + works = the_facet.works.distinct() feed_path = the_facet.feed_path title = the_facet.title metadata = {"title": title} links = 
[] - feedlist = [] - feed = {"@context": JSONCONTEXT, "metadata": metadata, "links": links, "publications": feedlist} - + # add title # TO DO: will need to calculate the number items and where in the feed we are - + metadata['title'] = title + ' - sorted by ' + order_by - + # links: start, self, next/prev (depending what's necessary -- to start with put all CC books) - + if not page: page = 0 else: @@ -255,37 +253,55 @@ def opds_feed_for_works(the_facet, page=None, order_by='newest'): page = 0 # self link - append_navlink(feed, 'self', feed_path, page , order_by, title="First {}".format(books_per_page)) - - # next link + append_navlink(links, 'self', feed_path, page, order_by, + title="First {}".format(books_per_page)) + + # next link try: works[books_per_page * page + books_per_page] - append_navlink(feed, 'next', feed_path, page+1 , order_by, - title="Next {}".format(books_per_page)) + append_navlink(links, 'next', feed_path, page+1, order_by, + title="Next {}".format(books_per_page)) except IndexError: pass - + # sort facets - append_navlink(feed, FACET_RELATION, feed_path, None, 'popular', group="Order", active = order_by=='popular', title="Sorted by popularity") - append_navlink(feed, FACET_RELATION, feed_path, None, 'newest', group="Order", active = order_by=='newest', title="Sorted by newest") - + append_navlink(links, FACET_RELATION, feed_path, None, 'popular', group="Order", + active=order_by == 'popular', title="Sorted by popularity") + append_navlink(links, FACET_RELATION, feed_path, None, 'newest', group="Order", + active=order_by == 'newest', title="Sorted by newest") + #other facets for other_group in the_facet.facet_object.get_other_groups(): for facet_object in other_group.get_facets(): - append_navlink(feed, FACET_RELATION, feed_path + '/' + facet_object.facet_name, None, order_by, group=other_group.title, title=facet_object.title) - - works = islice(works, books_per_page * page, books_per_page * page + books_per_page) + append_navlink( + links, 
FACET_RELATION, + feed_path + '/' + facet_object.facet_name, None, order_by, + group=other_group.title, title=facet_object.title + ) + + works = islice(works, books_per_page * page, books_per_page * page + books_per_page) if page > 0: - append_navlink(feed, 'previous', feed_path, page-1, order_by, title="Previous {}".format(books_per_page)) + append_navlink(links, 'previous', feed_path, page-1, order_by, + title="Previous {}".format(books_per_page)) + + yield '{' + f""" +"@context": {JSONCONTEXT}, +"metadata": {json.dumps(metadata, indent=2,)}, +"links": {json.dumps(links, indent=2,)}, +"publications": +[ +""" + for work in works: node = work_node(work, facet=the_facet.facet_object) - feedlist.append(node) - return json.dumps(feed,indent=2, separators=(',', ': '), sort_keys=False) + yield json.dumps(node, indent=2) + ',\r' + yield '\r]\r}' -def append_navlink(feed, rel, path, page, order_by, group=None, active=None , title=""): - link = { +def append_navlink(links, rel, path, page, order_by, group=None, active=None, title=""): + link = { "rel": rel, - "href": UNGLUEIT_URL + "/api/opdsjson/" + urlquote(path) + '/?order_by=' + order_by + ('&page=' + unicode(page) ), + "href": UNGLUEIT_URL + "/api/opdsjson/" + urlquote(path) + + '/?order_by=' + order_by + '&page=' + str(page), "type": ACQUISITION, "title": title, } @@ -294,4 +310,4 @@ def append_navlink(feed, rel, path, page, order_by, group=None, active=None , ti link['facetGroup'] = group if active: link['activeFacet'] = 'true' - feed['links'].append(link) \ No newline at end of file + links.append(link) diff --git a/api/resources.py b/api/resources.py deleted file mode 100755 index bab1fc0db..000000000 --- a/api/resources.py +++ /dev/null @@ -1,176 +0,0 @@ -import logging - -from tastypie import fields -from tastypie.constants import ALL, ALL_WITH_RELATIONS -from tastypie.resources import ModelResource, Resource, Bundle -from tastypie.utils import trailing_slash -from tastypie.authentication import 
ApiKeyAuthentication, Authentication -from tastypie.exceptions import BadRequest - -from django.conf.urls import url -from django.contrib import auth -from django.contrib.auth.models import User -from django.core.urlresolvers import reverse - -from regluit.core import models -import regluit.core.isbn - -logger = logging.getLogger(__name__) - - -class EditionResource(ModelResource): - work = fields.ForeignKey('regluit.api.resources.WorkResource', 'work') - identifiers = fields.ToManyField('regluit.api.resources.IdentifierResource', 'identifiers') - ebooks = fields.ToManyField('regluit.api.resources.EbookResource', 'ebooks') - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Edition.objects.all() - resource_name = 'edition' - filtering = { - "isbn_13": ALL, "identifiers": ALL_WITH_RELATIONS, - } - - def build_filters(self, filters = None, **kwargs): - if filters is None: - filters = {} - for filter_expr, value in filters.items(): - if filter_expr.startswith('isbn_13'): - filters['identifiers__type'] = 'isbn' - if len(filter_expr)>7: - filters['identifiers__value'+filter_expr[7:]] = value - else: - filters['identifiers__value'] = value - del filters[ filter_expr ] - return super(EditionResource, self).build_filters(filters) - -class IdentifierResource(ModelResource): - work = fields.ForeignKey('regluit.api.resources.WorkResource', 'work') - edition = fields.ForeignKey('regluit.api.resources.EditionResource', 'edition') - - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Identifier.objects.all() - resource_name = 'identifier' - filtering = { - "value": ALL, "type" : ALL, - } - - - -class WorkResource(ModelResource): - editions = fields.ToManyField(EditionResource, 'editions') - identifiers = fields.ToManyField(IdentifierResource, 'identifiers') - - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Work.objects.all() - resource_name = 'work' - filtering = {'editions': ALL_WITH_RELATIONS, 'id': 
ALL, 'identifiers': ALL_WITH_RELATIONS} - ordering = ['num_wishes'] - -class CampaignResource(ModelResource): - work = fields.ToOneField(WorkResource, 'work') - - def alter_list_data_to_serialize(self, request, data): - """ - annotate the list of campaigns with information from the logged in - user. note: this isn't the user identified by the api username/api_key - it's the the user that client might be logged into unglue.it as. - """ - u = auth.get_user(request) - if isinstance(u, User): - data['meta']['logged_in_username'] = u.username - wishlist_work_ids = [w.id for w in u.wishlist.works.all()] - else: - data['meta']['logged_in_username'] = None - wishlist_work_ids = [] - - for o in data['objects']: - o.data['in_wishlist'] = o.obj.work_id in wishlist_work_ids - # there's probably a better place up the chain (where the Campaign objects are directly available) to grab the status - c = models.Campaign.objects.get(id=o.data["id"]) - o.data['status'] = c.status - o.data['current_total'] = c.current_total - - # TODO: add pledging information - return data - - def alter_detail_data_to_serialize(self, request, obj): - c = models.Campaign.objects.get(id=obj.data["id"]) - obj.data['status'] = c.status - obj.data['current_total'] = c.current_total - return obj - - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Campaign.objects.all() - resource_name = 'campaign' - excludes = ['amazon_receiver', 'paypal_receiver'] - filtering = { - "work": ALL_WITH_RELATIONS, - } - -class AuthorResource(ModelResource): - works = fields.ToManyField(WorkResource, 'works') - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Author.objects.all() - resource_name = 'author' - -class SubjectResource(ModelResource): - works = fields.ToManyField(WorkResource, 'works') - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Subject.objects.all() - resource_name = 'subject' - -class EbookResource(ModelResource): - edition = 
fields.ToOneField(EditionResource, 'edition') - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Ebook.objects.all() - resource_name = 'ebook' - excludes = ['url'] - -class PublisherResource(ModelResource): - ebooks = fields.ToManyField(EbookResource, attribute=lambda bundle: models.Ebook.objects.filter(edition__publisher_name=bundle.obj.name)) - class Meta: - authentication = ApiKeyAuthentication() - queryset = models.Publisher.objects.all() - resource_name = 'publisher' - -class FreeResource(ModelResource): - def alter_list_data_to_serialize(self, request, data): - del data["meta"]["limit"] - del data["meta"]["offset"] - return data - - def dehydrate(self, bundle): - bundle.data["filetype"]=bundle.obj.format - bundle.data["rights"]=bundle.obj.rights - bundle.data["provider"]=bundle.obj.provider - bundle.data["href"]=reverse('download_ebook',kwargs={'ebook_id':bundle.obj.id}) - return bundle - - def obj_get_list(self, bundle, **kwargs): - request = bundle.request - isbn ="" - if hasattr(request, 'GET'): - isbn = request.GET.get("isbn","") - isbn = isbn.replace('-','') - if len(isbn)==10: - isbn=regluit.core.isbn.convert_10_to_13(isbn) - try: - work=models.Identifier.objects.get(type='isbn',value=isbn,).work - base_object_list = models.Ebook.objects.filter(edition__work=work) - return base_object_list - except ValueError: - raise BadRequest("Invalid resource lookup data provided (mismatched type).") - except models.Identifier.DoesNotExist: - return models.Ebook.objects.none() - - class Meta: - authentication = ApiKeyAuthentication() - fields = [ 'provider', 'rights' ] - limit = 0 - include_resource_uri = False diff --git a/api/templates/api_help.html b/api/templates/api_help.html index 97fd8239d..57389bbfe 100644 --- a/api/templates/api_help.html +++ b/api/templates/api_help.html @@ -5,13 +5,7 @@ {% block doccontent %}

API Help

-

Some of the data from Unglue.it is avaiable via a JSON API. You will need a key and username to be able to use the API. -

- {% if user.is_authenticated %} -

Welcome {{user.username}}. Your API key is {{api_key}}.

- {% else %} -

Please sign in first.

- {% endif %} +

Ebook Widgets

You don't need an API key to embed Ebook (HTML) widgets.

Here's a widget that displays Unglue.it's featured ebook - it changes most every day! {{base_url}}{% url 'widget' 'featured' %}. Copy/paste this into your site:
@@ -30,72 +24,35 @@

Ebook Widgets

{% if campaign %}
Here's a sample widget for the book {{campaign.name}} with ISBN {{campaign_isbn}}: {{base_url}}{% url 'widget' campaign_isbn %}. Copy/paste this into your site:
- - + +
{% endif %} - -

Basic API info

- - Available Resources (JSON) - -

- Resources on this list can be accessed via {{ base_url }}/api/v1/{resource_on_list}/?format=json&api_key={your_api_key}&username={your_username} . -

- -

- Works can be ordered by how often they've been favorited; append &order_by=num_wishes (ascending) or &order_by=-num_wishes (descending). -

-

Free Ebooks by ISBN

-

With an API key, you can check if there's a free ebook for any ISBN. ISBNs can be 10 or 13 digits, and can include dashes. This service returns all free-licensed ebooks for a work associated with an ISBN, and for each ebook includes information about file type, rights, and the provider hosting the file.

-

For example, here's how to get a list of ebook files for "Homeland".

-

{% if user.is_authenticated %} - JSON: {{base_url}}/api/v1/free/?isbn=9780765333698&format=json&api_key={your_api_key}&username={your_username} -
XML: {{base_url}}/api/v1/free/?isbn=9780765333698&format=xml&api_key={your_api_key}&username={your_username} - {% else %} - (Log in to see links) - {% endif %}

-

Identifier Resolution

-

Here's how to get work/edition data for an isbn

-

{% if user.is_authenticated %} -JSON: {{base_url}}/api/v1/identifier/?format=json&api_key={your_api_key}&username={your_username}&type=isbn&value=9780441012039
-XML: {{base_url}}/api/v1/identifier/?format=xml&api_key={your_api_key}&username={your_username}&type=isbn&value=9780441012039

{% else %} - (Log in to see links) - {% endif %}

- {% if user.is_authenticated %} -

Campaign info

-

Here's how to get data on all campaigns. if the user is logged in to Unglue.it, they can tell if the campaign book is on their fave list

-

JSON: {{base_url}}/api/v1/campaign/?format=json&api_key={your_api_key}&username={your_username}<
XML: {{base_url}}/api/v1/campaign/?format=json&api_key={your_api_key}&username={your_username}

-

Identifier Resolution

-

Here's how to get work/edition data for an isbn

-

JSON: {{base_url}}/api/v1/identifier/?format=json&api_key={your_api_key}&username={your_username}&type=isbn&value=9780441012039
- XML: {{base_url}}/api/v1/identifier/?format=xml&api_key={your_api_key}&username={your_username}&type=isbn&value=9780441012039

-

In addition to isbn, you can use 'goog' if you have a google books id, and 'oclc' for oclc numbers.

- {% endif %} -

OPDS Catalog Feeds

-

We have a basic implementation of OPDS feeds. You don't need a key to use them. The starting point is {{base_url}}{% url 'opds' %}

+

We have a basic implementation of OPDS feeds. You don't need a key to use them. The starting point is {{base_url}}{% url 'opds' %}. Use the page parameter to page through the results.

Examples:

filtered by format
-
{{base_url}}{% url 'opds_acqusition' 'epub' %}
+
{{base_url}}{% url 'opds_acqusition' 'epub' %}
filtered by license
-
{{base_url}}{% url 'opds_acqusition' 'by-sa' %}
+
{{base_url}}{% url 'opds_acqusition' 'by-sa' %}
filtered by title search
-
{{base_url}}{% url 'opds_acqusition' 's.open' %}
+
{{base_url}}{% url 'opds_acqusition' 's.open' %}
filtered by keyword
-
{{base_url}}{% url 'opds_acqusition' 'kw.fiction' %}
+
{{base_url}}{% url 'opds_acqusition' 'kw.fiction' %}
filtered by ungluer
-
{{base_url}}{% url 'opds_acqusition' '@eric' %}
+
{{base_url}}{% url 'opds_acqusition' '@eric' %}
+
filtered by having a Project Gutenberg or DOAB identifier (doab, gtbg)
+
{{base_url}}{% url 'opds_acqusition' 'doab/-gtbg' %}?page=1

There's also an OPDS record available for every work on unglue.it. For example, requesting, {{base_url}}{% url 'opds_acqusition' 'all'%}?work=13950 get you to the web page or opds record for A Christmas Carol.

- +

ONIX Catalog Feeds

-

There is an ONIX 3.0 feed corresponding to every facet of our free ebook lists. You don't need a key to use them. There is a maximum of 100 books per result you can change with the max parameter. For example, here are the first hundred CC BY-ND-ND licensed books available in EPUB.

+

There is an ONIX 3.0 feed corresponding to every facet of our free ebook lists. You don't need a key to use them. There is a maximum of 100 books per result you can change with the max parameter. For example, here are the first twenty CC BY-ND-ND licensed books available in EPUB. Pages of 30 records each are available via the page parameter. Here's the first page of books from the Directory of Open Access Books.

There's also an ONIX record available for every free ebook on unglue.it. For example, here is Issues in Open Research Data.

- +

Identifiers with Content type negotiation

There's a URI to identify every work used in OPDS feeds. HTTP content negotiation is used for these ids, so requesting application/atom+xml;profile=opds-catalog;kind=acquisition for {{base_url}}{% url 'work_identifier' '13950' %} get you to the web page or opds record for A Christmas Carol. requesting text/xml gets you the onix record. Otherwise, you get the normal html page.

diff --git a/api/templates/editions.html b/api/templates/editions.html index 71e815320..10fb0f8e0 100644 --- a/api/templates/editions.html +++ b/api/templates/editions.html @@ -8,7 +8,7 @@ - {% if editions %} + {% if editions.exists %}
    {% for edition in editions %}
  • {{edition.id}} | {{edition.title}} | @@ -22,5 +22,3 @@ - - diff --git a/api/templates/load_yaml.html b/api/templates/load_yaml.html index a66be387a..9826a3e95 100644 --- a/api/templates/load_yaml.html +++ b/api/templates/load_yaml.html @@ -14,5 +14,3 @@ - - diff --git a/api/templates/opds.json b/api/templates/opds.json index 06d209124..aca25025d 100644 --- a/api/templates/opds.json +++ b/api/templates/opds.json @@ -11,7 +11,7 @@ {"title": "{{ feed.title }} - Popular", "href": "{{ feed.feed_path|urlencode }}/?order_by=popular", "type": "application/opds+json"}, {"title": "{{ feed.title }} - New", "href": "{{ feed.feed_path|urlencode }}/?order_by=newest", "type": "application/opds+json" }, {% for feed in feeds %} - {"title": "{{ feed.title }}", "href": "{{ feed.feed_path|urlencode }}/", "type": "application/opds+json" }, + {"title": "{{ feed.title }}", "href": "{{ feed.feed_path|urlencode }}/", "type": "application/opds+json" }{% if not forloop.last %},{% endif %} {% endfor %} ] } diff --git a/api/templates/opds.xml b/api/templates/opds.xml index 30cb09304..7524c6f63 100644 --- a/api/templates/opds.xml +++ b/api/templates/opds.xml @@ -4,7 +4,7 @@ xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"> Unglue.it Catalog https://unglue.it{% url 'opds' %} - 2014-06-13T21:48:34Z + {{ feed.updated }} unglue.it https://unglue.it/ @@ -45,4 +45,4 @@ {{ feed.description }} {% endfor %} - \ No newline at end of file + diff --git a/api/templates/widget.html b/api/templates/widget.html index 657da2372..ff6f52385 100644 --- a/api/templates/widget.html +++ b/api/templates/widget.html @@ -43,5 +43,3 @@ - - diff --git a/api/tests.py b/api/tests.py index 3e5e39f7a..a47b4c795 100755 --- a/api/tests.py +++ b/api/tests.py @@ -23,13 +23,13 @@ class ApiTests(TestCase): fixtures = ['initial_data.json', 'neuromancer.json'] work_id=None - + def setUp(self): edition = models.Edition.objects.get(pk=1) self.work_id=edition.work_id campaign = models.Campaign.objects.create( 
name=edition.work.title, - work=edition.work, + work=edition.work, description='Test Campaign', deadline=now(), target=Decimal('1000.00'), @@ -38,112 +38,12 @@ def setUp(self): self.client = Client() ebook = models.Ebook.objects.create( url="http://example.com/ebook", - provider="github", + provider="github", rights='CC BY', format='epub', edition=edition, ) - def test_user(self): - self.assertEqual(User.objects.all().count(), 1) - self.assertTrue(User.objects.all()[0].api_key.key) - - def test_no_auth(self): - r = self.client.get('/api/v1/campaign/', data={'format': 'json'}) - self.assertEqual(r.status_code, 401) - - def test_campaigns(self): - r = self.client.get('/api/v1/campaign/', data={ - 'format': 'json', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - self.assertEqual(r.status_code, 200) - j = json.loads(r.content) - self.assertEqual(len(j['objects']), 1) - self.assertEqual(j['objects'][0]['name'], 'Neuromancer') - self.assertEqual(j['objects'][0]['work'], '/api/v1/work/%s/' % self.work_id) - resource_uri=j['objects'][0]['resource_uri'] - r = self.client.get( resource_uri, data={ - 'format': 'json', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - self.assertEqual(r.status_code, 200) - j = json.loads(r.content) - self.assertEqual(j['name'], 'Neuromancer') - self.assertEqual(j['work'], '/api/v1/work/%s/' % self.work_id) - - def test_campaign_lookup_by_isbn(self): - r = self.client.get('/api/v1/campaign/', data={ - 'format': 'json', - 'work__identifiers__value': regluit.core.isbn.convert_10_to_13('0441007465'), - 'work__identifiers__type': 'isbn', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - self.assertEqual(r.status_code, 200) - j = json.loads(r.content) - self.assertEqual(len(j['objects']), 1) - self.assertEqual(j['objects'][0]['name'], 'Neuromancer') - self.assertEqual(j['meta']['logged_in_username'], None) - self.assertEqual(j['objects'][0]['in_wishlist'], False) - - def 
test_identifier_lookup(self): - r = self.client.get('/api/v1/identifier/', data={ - 'format': 'json', - 'value': regluit.core.isbn.convert_10_to_13('0441007465'), - 'type': 'isbn', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - self.assertEqual(r.status_code, 200) - - def test_logged_in_user_info(self): - # login and see if adding a work to the users wishlist causes - # it to show up as in_wishlist in the campaign info - self.client.login(username='test', password='testpass') - - r = self.client.get('/api/v1/campaign/', data={ - 'format': 'json', - 'work__identifiers__value': regluit.core.isbn.convert_10_to_13('0441007465'), - 'work__identifiers__type': 'isbn', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - j = json.loads(r.content) - self.assertEqual(j['meta']['logged_in_username'], 'test') - self.assertEqual(j['objects'][0]['in_wishlist'], False) - - w = models.Work.objects.get(identifiers__value=regluit.core.isbn.convert_10_to_13('0441007465'), identifiers__type='isbn') - self.user.wishlist.add_work(w,'test') - r = self.client.get('/api/v1/campaign/', data={ - 'format': 'json', - 'work__identifiers__value': regluit.core.isbn.convert_10_to_13('0441007465'), - 'work__identifiers__type': 'isbn', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - j = json.loads(r.content) - self.assertEqual(j['meta']['logged_in_username'], 'test') - self.assertEqual(j['objects'][0]['in_wishlist'], True) - - r = self.client.get('/api/v1/free/', data={ - 'format': 'json', - 'isbn': '0441007465', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - j = json.loads(r.content) - self.assertEqual(j['objects'][0]['filetype'], 'epub') - r = self.client.get('/api/v1/free/', data={ - 'format': 'xml', - 'isbn': '0441007465', - 'username': self.user.username, - 'api_key': self.user.api_key.key - }) - self.assertTrue(r.content.find('CC BY')>0) - def test_widget(self): r = 
self.client.get('/api/widget/0441007465/') self.assertEqual(r.status_code, 200) @@ -169,6 +69,27 @@ def test_opds(self): r = self.client.get('/api/opds/?work=%s' % self.test_work_id) self.assertEqual(r.status_code, 200) + def test_opds_all_keyword_alias_works(self): + r = self.client.get('/api/opds/all/kw.Fiction/') + self.assertEqual(r.status_code, 200) + + def test_opds_keyword_compound_returns_404(self): + r = self.client.get('/api/opds/kw.Fiction/epub/') + self.assertEqual(r.status_code, 404) + + def test_opds_single_keyword_works(self): + r = self.client.get('/api/opds/kw.Fiction/') + self.assertEqual(r.status_code, 200) + + def test_opdsjson_keyword_compound_returns_404(self): + r = self.client.get('/api/opdsjson/kw.Fiction/epub/') + self.assertEqual(r.status_code, 404) + + def test_onix_keyword_compound_returns_404(self): + r = self.client.get('/api/onix/kw.Fiction/epub/') + self.assertEqual(r.status_code, 404) + + def test_nix(self): r = self.client.get('/api/onix/by/') self.assertEqual(r.status_code, 200) @@ -179,6 +100,10 @@ def test_nix(self): r = self.client.get('/api/onix/?work=%s' % self.test_work_id) self.assertEqual(r.status_code, 200) + def test_onix_all_keyword_alias_works(self): + r = self.client.get('/api/onix/all/kw.Fiction/') + self.assertEqual(r.status_code, 200) + class AllowedRepoTests(TestCase): def setUp(self): apimodels.AllowedRepo.objects.create(org='test',repo_name='test') @@ -205,7 +130,7 @@ def test_travisci_webhook(self): """ payload = json.dumps({ - "repository":{ + "repository":{ "id":4651401, "name":"Adventures-of-Huckleberry-Finn_76", "owner_name":"GITenberg", @@ -214,9 +139,9 @@ def test_travisci_webhook(self): "status_message": "Passed", "type": "push" }) - + invalid_payload = json.dumps({ - "repository":{ + "repository":{ "id":4651401, "name":"", "url":"http://GITenberg.github.com/" @@ -230,13 +155,11 @@ def test_travisci_webhook(self): # 200 if a simple get r = self.client.get(url) self.assertEqual(r.status_code, 200) - + # 
200 when we actually load a valid repo (should we use 201?) r = self.client.post(url, data={'payload':payload}, headers={}, allow_redirects=True) self.assertEqual(r.status_code, 200) - + # 400 error if we get exception when trying to load a book r = self.client.post(url, data={'payload':invalid_payload}, headers={}, allow_redirects=True) self.assertEqual(r.status_code, 400) - - diff --git a/api/urls.py b/api/urls.py index ec34a6a96..6bab522cd 100644 --- a/api/urls.py +++ b/api/urls.py @@ -1,33 +1,19 @@ -from tastypie.api import Api - -from django.conf.urls import patterns, url, include +from django.conf.urls import url from django.views.generic.base import TemplateView -from regluit.api import resources from regluit.api.views import ApiHelpView from regluit.api.views import OPDSNavigationView, OPDSAcquisitionView from regluit.api.views import OnixView from regluit.api.views import ( - travisci_webhook, - load_yaml, - negotiate_content, - widget, + travisci_webhook, + load_yaml, + negotiate_content, + widget, featured_cover, featured_url, ) -v1_api = Api(api_name='v1') -v1_api.register(resources.WorkResource()) -v1_api.register(resources.IdentifierResource()) -v1_api.register(resources.EditionResource()) -v1_api.register(resources.CampaignResource()) -v1_api.register(resources.AuthorResource()) -v1_api.register(resources.SubjectResource()) -v1_api.register(resources.FreeResource()) -v1_api.register(resources.PublisherResource()) -v1_api.register(resources.EbookResource()) - urlpatterns = [ url(r'^help$', ApiHelpView.as_view(), name="api_help"), url(r'^widget/(?P\w+)/$', widget, name="widget"), @@ -42,5 +28,4 @@ url(r'^id/work/(?P\w+)/$', negotiate_content, name="work_identifier"), url(r'^loader/yaml$', load_yaml, name="load_yaml"), url(r'^travisci/webhook$', travisci_webhook, name="travisci_webhook"), - url(r'^', include(v1_api.urls)), ] diff --git a/api/views.py b/api/views.py index 5a3c2f3de..ca52528bf 100755 --- a/api/views.py +++ b/api/views.py @@ -1,13 +1,11 @@ 
-from tastypie.models import ApiKey - -import json +import json as json_module import logging from django.contrib import auth from django.contrib.auth.models import User from django.contrib.sites.models import Site -from django.core.urlresolvers import reverse -from django.shortcuts import render, render_to_response +from django.urls import reverse +from django.shortcuts import render from django.template import RequestContext from django.views.decorators.csrf import csrf_exempt from django.views.generic.base import View, TemplateView @@ -17,25 +15,29 @@ HttpResponseBadRequest, HttpResponseRedirect, Http404, + StreamingHttpResponse, ) import regluit.core.isbn -from regluit.core.bookloader import load_from_yaml from regluit.api import opds, onix, opds_json from regluit.api.models import repo_allowed - +from regluit.core.bookloader import load_from_yaml +from regluit.core.covers import DEFAULT_COVER from regluit.core import models +from regluit.core.facets import InvalidFacetCombination +from regluit.core.parameters import ORDER_BY_KEYS logger = logging.getLogger(__name__) +ANONYMOUS_MAX_RECORDS = 100 def editions(request): editions = models.Edition.objects.all() - return render(request, 'editions.html', + return render(request, 'editions.html', {'editions':editions}, - ) + ) def negotiate_content(request,work_id): if request.META.get('HTTP_ACCEPT', None): @@ -43,7 +45,7 @@ def negotiate_content(request,work_id): return HttpResponseRedirect(reverse('opds_acqusition',args=['all'])+'?work='+work_id) elif "text/xml" in request.META['HTTP_ACCEPT']: return HttpResponseRedirect(reverse('onix',args=['all'])+'?work='+work_id) - + return HttpResponseRedirect(reverse('work', kwargs={'work_id': work_id})) def featured_work(): @@ -51,17 +53,17 @@ def featured_work(): work = models.Work.objects.filter(featured__isnull=False).distinct().order_by('-featured')[0] except: #shouldn't occur except in tests - work = models.Work.objects.all()[0] + work = models.Work.objects.first() 
return work def widget(request, isbn): """ supply info for book panel. parameter is named isbn for historical reasons. can be isbn or work_id """ - + if isbn == 'featured': work = featured_work() - else : + else : if len(isbn)==10: isbn = regluit.core.isbn.convert_10_to_13(isbn) if len(isbn)==13: @@ -69,19 +71,19 @@ def widget(request, isbn): identifier = models.Identifier.objects.get(type = 'isbn', value = isbn ) work = identifier.work except models.Identifier.DoesNotExist: - return render(request, 'widget.html', - { 'work':None,}, + return render(request, 'widget.html', + { 'work':None,}, ) else: work= models.safe_get_work(isbn) - return render(request, 'widget.html', - {'work':work, }, + return render(request, 'widget.html', + {'work':work, }, ) def featured_cover(request): work = featured_work() tn = work.cover_image_thumbnail() - return HttpResponseRedirect(tn if tn else "/static/images/generic_cover_larger.png") + return HttpResponseRedirect(tn if tn else DEFAULT_COVER) def featured_url(request): work = featured_work() @@ -99,63 +101,57 @@ def load_yaml(request): try: work_id = load_from_yaml(repo_url) return HttpResponseRedirect(reverse('work', args=[work_id])) - except: + except: return HttpResponse('unsuccessful') - -@csrf_exempt + +@csrf_exempt def travisci_webhook(request): """ Respond to travis-ci webhooks from Project GITenberg repositories. If the webhook is successfully parsed, the metdata.yaml for the repository is loaded using load_from_yaml. 
https://docs.travis-ci.com/user/notifications/#Webhook-notification - + """ if request.method == "POST": - + try: - - data = json.loads(request.POST.get('payload')) + + data = json_module.loads(request.POST.get('payload')) # example of URL to feed to yaml loader: # https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/raw/master/metadata.yaml - + if data['status_message'] == 'Passed' and data['type'] == 'push': - + # another way to get owner_name / name would be request.META.get('HTTP_TRAVIS_REPO_SLUG', '') repo_url = "https://github.com/{}/{}/raw/master/metadata.yaml".format(data['repository']['owner_name'], data['repository']['name']) - + work_id = load_from_yaml(repo_url) return HttpResponse('Successful. work_id: {}'.format(work_id)) - + except Exception as e: - return HttpResponseBadRequest('Unsuccessful. Exception: {}'.format(unicode(e))) - + return HttpResponseBadRequest('Unsuccessful. Exception: {}'.format(str(e))) + else: - + return HttpResponse('No action') - + else: return HttpResponse('No action') - - - + + + class ApiHelpView(TemplateView): template_name = "api_help.html" def get_context_data(self, **kwargs): context = super(ApiHelpView, self).get_context_data(**kwargs) - + # base_url passed in to allow us to write absolute URLs for this site base_url = self.request.build_absolute_uri("/")[:-1] context["base_url"] = base_url - - # if user is logged in, pass in the user's API key - u = auth.get_user(self.request) - if u.is_authenticated(): - api_key = ApiKey.objects.filter(user=u)[0].key - context['api_key'] = api_key - + # pass in a sample Campaign whose widget can be displayed campaigns = models.Campaign.objects.all() if len(campaigns): @@ -164,22 +160,22 @@ def get_context_data(self, **kwargs): context["campaign"] = campaigns[0] context["campaign_isbn"] = isbn - return context + return context class OPDSNavigationView(TemplateView): - json=False + json = False # https://stackoverflow.com/a/6867976: secret to how to change content-type - + def 
render_to_response(self, context, **response_kwargs): - if json: + if self.json: response_kwargs['content_type'] = "application/vnd.opds.navigation+json" else: - response_kwargs['content_type'] = "application/atom+xml;profile=opds-catalog;kind=navigation" + response_kwargs['content_type'] = opds.NAVIGATION return super(TemplateView, self).render_to_response(context, **response_kwargs) - + def get_context_data(self, **kwargs): context = super(OPDSNavigationView, self).get_context_data(**kwargs) - if json: + if self.json: context["feeds"] = opds_json.feeds() context["feed"] = opds_json.get_facet_facet('all') else: @@ -193,47 +189,69 @@ def get(self, request, *args, **kwargs): work = request.GET.get('work', None) if work: if self.json: - return HttpResponse(opds_json.opds_feed_for_work(work), + return StreamingHttpResponse(opds_json.opds_feed_for_work(work), content_type="application/opds-publication+json") else: - return HttpResponse(opds.opds_feed_for_work(work), - content_type="application/atom+xml;profile=opds-catalog;kind=acquisition") + return StreamingHttpResponse(opds.opds_feed_for_work(work), + content_type=opds.ACQUISITION) facet = kwargs.get('facet') page = request.GET.get('page', None) order_by = request.GET.get('order_by', 'newest') + + # robots occasionally mangle order_by + order_by = order_by if order_by in ORDER_BY_KEYS else 'newest' + try: page = int(page) except: page = None - if self.json: - facet_class = opds_json.get_facet_class(facet)() - return HttpResponse(facet_class.feed(page,order_by), - content_type="application/opds+json") - else: - facet_class = opds.get_facet_class(facet)() - return HttpResponse(facet_class.feed(page,order_by), - content_type="application/atom+xml;profile=opds-catalog;kind=acquisition") - + try: + if self.json: + facet_class = opds_json.get_facet_class(facet)() + return StreamingHttpResponse(facet_class.feed(page,order_by), + content_type="application/opds+json; charset=utf-8") + else: + facet_class = 
opds.get_facet_class(facet)() + return StreamingHttpResponse(facet_class.feed(page,order_by), + content_type=opds.ACQUISITION) + except InvalidFacetCombination: + raise Http404("Compound keyword facet URLs are not supported.") class OnixView(View): - def get(self, request, *args, **kwargs): work = request.GET.get('work', None) + if work: try: - work=models.safe_get_work(work) + work = models.safe_get_work(work) except models.Work.DoesNotExist: - raise Http404 - return HttpResponse(onix.onix_feed_for_work(work), - content_type="text/xml") + raise Http404 + return HttpResponse(onix.onix_feed_for_work(work), content_type="text/xml") + facet = kwargs.get('facet', 'all') - if facet: - max = request.GET.get('max', 100) - try: - max = int(max) - except: - max = None + + if not facet: + return HttpResponseBadRequest(content='No facet provided') + + max_records = request.GET.get('max', ANONYMOUS_MAX_RECORDS) + + try: + max_records = int(max_records) + except Exception: + max_records = ANONYMOUS_MAX_RECORDS + + max_records = max_records if request.user.is_authenticated else ANONYMOUS_MAX_RECORDS + + try: facet_class = opds.get_facet_class(facet)() - return HttpResponse(onix.onix_feed(facet_class, max), - content_type="text/xml") + except InvalidFacetCombination: + raise Http404("Compound keyword facet URLs are not supported.") + page = request.GET.get('page', None) + try: + page = int(page) + except: + page = None + + feed = onix.onix_feed(facet_class, max_records, page_number=page) + return StreamingHttpResponse(feed, content_type="text/xml") diff --git a/bisac/__init__.py b/bisac/__init__.py index 554feb6d8..3a7fe611f 100644 --- a/bisac/__init__.py +++ b/bisac/__init__.py @@ -1,5 +1,4 @@ # data from https://github.com/edsu/bisac - class Bisac(object): def __init__(self): @@ -16,7 +15,6 @@ def code(self, subject): return top return bisac.get(subject, {}).get('notation','') - bisac= { "Religion / Christian Life / Social Issues": { "related": [], @@ -23130,4 +23128,215 @@ def 
code(self, subject): "notation": "HIS042000", "alt_label": [] }, + "Political Science / Public Policy / Economic Policy": { + "related": [], + "pref_label": "Political Science / Public Policy / Economic Policy", + "notation": "POL024000", + "alt_label": [] + }, + "Biography & Autobiography / Science & Technology": { + "related": [], + "pref_label": "Biography & Autobiography / Science & Technology", + "notation": "BIO015000", + "alt_label": [] + }, + "History / Middle East / Iran": { + "related": [], + "pref_label": "History / Middle East / Iran", + "notation": "HIS026020", + "alt_label": [] + }, + "History / Europe / Spain & Portugal": { + "related": [], + "pref_label": "History / Europe / Spain & Portugal", + "notation": "HIS045000", + "alt_label": [] + }, + "History / African American": { + "related": [], + "pref_label": "History / African American", + "notation": "HIS056000", + "alt_label": [] + }, + "History / Women": { + "related": [], + "pref_label": "History / Women", + "notation": "HIS058000", + "alt_label": [] + }, + "History / Europe / Poland": { + "related": [], + "pref_label": "History / Europe / Poland", + "notation": "HIS060000", + "alt_label": [] + }, + "Language Arts & Disciplines / Literacy": { + "related": [], + "pref_label": "Language Arts & Disciplines / Literacy", + "notation": "LAN010000", + "alt_label": [] + }, + "Language Arts & Disciplines / Linguistics / Sociolinguistics": { + "related": [], + "pref_label": "Language Arts & Disciplines / Linguistics / Sociolinguistics", + "notation": "LAN009050", + "alt_label": [] + }, + "Language Arts & Disciplines / Library & Information Science / Digital & Online Resources": { + "related": [], + "pref_label": "Language Arts & Disciplines / Library & Information Science / Digital & Online Resources", + "notation": "LAN025060", + "alt_label": [] + }, + "Literary Criticism / European / Eastern": { + "related": [], + "pref_label": "Literary Criticism / European / Eastern", + "notation": "LIT004110", + 
"alt_label": [] + }, + "Literary Criticism / Comparative Literature": { + "related": [], + "pref_label": "Literary Criticism / Comparative Literature", + "notation": "LIT020000", + "alt_label": [] + }, + "Literary Criticism / Modern / General": { + "related": [], + "pref_label": "Literary Criticism / Modern / General", + "notation": "LIT024000", + "alt_label": [] + }, + "Literary Criticism / Modern / 16th Century": { + "related": [], + "pref_label": "Literary Criticism / Modern / 16th Century", + "notation": "LIT024010", + "alt_label": [] + }, + "Literary Criticism / Modern / 17th Century": { + "related": [], + "pref_label": "Literary Criticism / Modern / 17th Century", + "notation": "LIT024020", + "alt_label": [] + }, + "Literary Criticism / Modern / 18th Century": { + "related": [], + "pref_label": "Literary Criticism / Modern / 18th Century", + "notation": "LIT024030", + "alt_label": [] + }, + "Literary Criticism / Modern / 19th Century": { + "related": [], + "pref_label": "Literary Criticism / Modern / 19th Century", + "notation": "LIT024040", + "alt_label": [] + }, + "Literary Criticism / Modern / 20th Century": { + "related": [], + "pref_label": "Literary Criticism / Modern / 20th Century", + "notation": "LIT024050", + "alt_label": [] + }, + "Literary Criticism / Modern / 21st Century": { + "related": [], + "pref_label": "Literary Criticism / Modern / 21st Century", + "notation": "LIT024060", + "alt_label": [] + }, + "Political Science / Security (National & International)": { + "related": [], + "pref_label": "Political Science / Security (National & International)", + "notation": "POL012000", + "alt_label": [] + }, + "Political Science / Intergovernmental Organizations": { + "related": [], + "pref_label": "Political Science / Intergovernmental Organizations", + "notation": "POL048000", + "alt_label": [] + }, + "Political Science / Genocide & War Crimes": { + "related": [], + "pref_label": "Political Science / Genocide & War Crimes", + "notation": 
"POL061000", + "alt_label": [] + }, + "Political Science / Geopolitics": { + "related": [], + "pref_label": "Political Science / Geopolitics", + "notation": "POL062000", + "alt_label": [] + }, + "Political Science / Political Process / Media & Internet": { + "related": [], + "pref_label": "Political Science / Political Process / Media & Internet", + "notation": "POL065000", + "alt_label": [] + }, + "Political Science / Public Policy / Military Policy": { + "related": [], + "pref_label": "Political Science / Public Policy / Military Policy", + "notation": "POL069000", + "alt_label": [] + }, + "Psychology / Animal & Comparative Psychology": { + "related": [], + "pref_label": "Psychology / Animal & Comparative Psychology", + "notation": "PSY054000", + "alt_label": [] + }, + "Religion / Buddhism / General": { + "related": [], + "pref_label": "Religion / Buddhism / General", + "notation": "REL007000", + "alt_label": [] + }, + "Science / Environmental Science": { + "related": [], + "pref_label": "Science / Environmental Science", + "notation": "SCI026000", + "alt_label": [] + }, + "Science / Ethics": { + "related": [], + "pref_label": "Science / Ethics", + "notation": "SCI101000", + "alt_label": [] + }, + "Social Science / Sociology / Social Theory": { + "related": [], + "pref_label": "Social Science / Sociology / Social Theory", + "notation": "SOC026040", + "alt_label": [] + }, + "Social Science / Indigenous Studies": { + "related": [], + "pref_label": "Social Science / Indigenous Studies", + "notation": "SOC062000", + "alt_label": [] + }, + "Technology & Engineering / Electronics / Circuits / General": { + "related": [], + "pref_label": "Technology & Engineering / Electronics / Circuits / General", + "notation": "TEC008010", + "alt_label": [] + }, + "History / Modern / 19th Century": { + "related": [], + "pref_label": "History / Modern / 19th Century", + "notation": "HIS037060", + "alt_label": [] + }, + "History / Europe / Greece": { + "related": [], + "pref_label": 
"History / Europe / Greece", + "notation": "HIS042000", + "alt_label": [] + }, + "History / Social History": { + "related": [], + "pref_label": "History / Social History", + "notation": "HIS054000", + "alt_label": [] + }, + } \ No newline at end of file diff --git a/bisac/management/commands/load_bisac.py b/bisac/management/commands/load_bisac.py index 123e4e5ea..8ef420c1b 100644 --- a/bisac/management/commands/load_bisac.py +++ b/bisac/management/commands/load_bisac.py @@ -9,4 +9,4 @@ class Command(BaseCommand): def handle(self, **options): populate_bisac_headings() attach_dangling_branches() - print "bisac table is ready" + print("bisac table is ready") diff --git a/bisac/migrations/0001_initial.py b/bisac/migrations/0001_initial.py index 0baaf7b1c..bd0d51d7c 100644 --- a/bisac/migrations/0001_initial.py +++ b/bisac/migrations/0001_initial.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): ('rght', models.PositiveIntegerField(editable=False, db_index=True)), ('tree_id', models.PositiveIntegerField(editable=False, db_index=True)), ('level', models.PositiveIntegerField(editable=False, db_index=True)), - ('parent', mptt.fields.TreeForeignKey(related_name='children', blank=True, to='bisac.BisacHeading', null=True)), + ('parent', mptt.fields.TreeForeignKey(on_delete=models.CASCADE, related_name='children', blank=True, to='bisac.BisacHeading', null=True)), ], options={ 'abstract': False, diff --git a/bisac/models.py b/bisac/models.py index 37c12ffd1..c379cec69 100644 --- a/bisac/models.py +++ b/bisac/models.py @@ -6,13 +6,24 @@ class BisacHeading(MPTTModel): full_label = models.CharField(max_length=100, unique=True) label = models.CharField(max_length=60, unique=False) notation = models.CharField(max_length=9, unique=False) - parent = TreeForeignKey('self', null=True, blank=True, related_name='children', db_index=True) + parent = TreeForeignKey('self', on_delete=models.CASCADE, null=True, blank=True, related_name='children', db_index=True) class MPTTMeta: 
order_insertion_by = ['notation'] - def __unicode__(self): + def __str__(self): return self.full_label + +def interpret_notation(notation): + #translate a notation + if notation: + try: + bisac_heading = BisacHeading.objects.get(notation=notation) + return bisac_heading.full_label + except BisacHeading.DoesNotExist: + pass + return notation + def populate_bisac_headings(): for key in bisac.keys(): diff --git a/bisac/urls.py b/bisac/urls.py index cf368f861..bb5e55f98 100644 --- a/bisac/urls.py +++ b/bisac/urls.py @@ -1,4 +1,4 @@ -from django.conf.urls import patterns, url, include +from django.conf.urls import url from .views import tree urlpatterns = [ diff --git a/bookdata/sitemaps.txt b/bookdata/sitemaps.txt index b91f2814e..5295a956c 100644 --- a/bookdata/sitemaps.txt +++ b/bookdata/sitemaps.txt @@ -1,9 +1,19 @@ -https://www.ubiquitypress.com/sitemap.xml -https://www.kriterium.se/sitemap.xml +https://aperio.press/sitemap.xml +https://hup.fi/sitemap.xml +https://iitikship.iiti.ac.in/sitemap.xml https://oa.finlit.fi/sitemap.xml -https://www.humanities-map.net/sitemap.xml https://oa.psupress.org/sitemap.xml +https://press.lse.ac.uk/sitemap.xml +https://press.sjms.nu/sitemap.xml +https://publishing.vt.edu/sitemap.xml +https://universitypress.whiterose.ac.uk/sitemap.xml +https://utsepress.lib.uts.edu.au/sitemap.xml +https://www.humanities-map.net/sitemap.xml +https://www.kriterium.se/sitemap.xml https://www.larcommons.net/sitemap.xml -https://www.uwestminsterpress.co.uk/sitemap.xml +https://www.luminosoa.org/sitemap.xml +https://www.mwv-open.de/sitemap.xml https://www.stockholmuniversitypress.se/sitemap.xml -https://www.luminosoa.org/sitemap.xml \ No newline at end of file +https://www.ubiquitypress.com/sitemap.xml +https://www.uwestminsterpress.co.uk/sitemap.xml +https://www.winchesteruniversitypress.org/sitemap.xml diff --git a/booxtream/__init__.py b/booxtream/__init__.py index 4c3d9cca7..60658ed05 100644 --- a/booxtream/__init__.py +++ b/booxtream/__init__.py @@ 
-1,6 +1,6 @@ import random from functools import partial -from urllib import quote +from urllib.parse import quote from xml.etree import ElementTree import requests @@ -64,7 +64,6 @@ def platform(self, epubfile=None, epub=True, kf8mobi=False, **kwargs): # fake it, so you can test other functions without hitting booxtream boox = Boox.objects.create( download_link_epub='https://github.com/eshellman/42_ebook/blob/master/download/42.epub?raw=true&extra=download.booxtream.com/', - download_link_mobi='https://github.com/eshellman/42_ebook/blob/master/download/42.mobi?raw=true', referenceid= kwargs.get('referenceid', '42'), downloads_remaining=kwargs.get('downloadlimit', 10), expirydays=kwargs.get('expirydays', 30), @@ -81,12 +80,8 @@ def platform(self, epubfile=None, epub=True, kf8mobi=False, **kwargs): download_link_epub = doc.find('.//DownloadLink[@type="epub"]') if download_link_epub is not None: download_link_epub = download_link_epub.text - download_link_mobi = doc.find('.//DownloadLink[@type="mobi"]') - if download_link_mobi is not None: - download_link_mobi = download_link_mobi.text boox = Boox.objects.create( download_link_epub=download_link_epub, - download_link_mobi=download_link_mobi, referenceid=kwargs.get('referenceid'), downloads_remaining=kwargs.get('downloadlimit'), expirydays=kwargs.get('expirydays'), diff --git a/booxtream/migrations/0002_remove_boox_download_link_mobi.py b/booxtream/migrations/0002_remove_boox_download_link_mobi.py new file mode 100644 index 000000000..668870ff7 --- /dev/null +++ b/booxtream/migrations/0002_remove_boox_download_link_mobi.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2022-07-28 06:16 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('booxtream', '0001_initial'), + ] + + operations = [ + migrations.RemoveField( + model_name='boox', + name='download_link_mobi', + ), + ] diff --git 
a/booxtream/models.py b/booxtream/models.py index cbf2e6066..09ccbea2b 100644 --- a/booxtream/models.py +++ b/booxtream/models.py @@ -7,7 +7,6 @@ class Boox(models.Model): keeps a record of a file that's been watermarked """ download_link_epub = models.URLField(null=True) - download_link_mobi = models.URLField(null=True) referenceid = models.CharField(max_length=32) downloads_remaining = models.PositiveSmallIntegerField(default=0) expirydays = models.PositiveSmallIntegerField() @@ -20,7 +19,5 @@ def expired(self): def download_link(self, format): if format == 'epub': return self.download_link_epub - elif format == 'mobi': - return self.download_link_mobi return None diff --git a/booxtream/tests.py b/booxtream/tests.py index c7cf266ff..6f27e65d1 100644 --- a/booxtream/tests.py +++ b/booxtream/tests.py @@ -1,19 +1,19 @@ import unittest import time -import urllib2 +from urllib.request import urlopen from tempfile import NamedTemporaryFile -from StringIO import StringIO +from io import BytesIO from django.conf import settings class TestBooXtream(unittest.TestCase): def setUp(self): # get a small epub test file as a file-like object self.epub2file = NamedTemporaryFile(delete=False) - test_file_content = urllib2.urlopen('http://www.hxa.name/articles/content/EpubGuide-hxa7241.epub') + test_file_content = urlopen('https://www.hxa.name/articles/content/EpubGuide-hxa7241.epub') self.epub2file.write(test_file_content.read()) self.epub2file.seek(0) self.textfile = NamedTemporaryFile(delete=False) - self.textfile.write("bad text file") + self.textfile.write(b'bad text file') self.textfile.seek(0) @@ -67,7 +67,7 @@ def test_booxtream_good(self): # make sure it works with an in-memory file self.epub2file.seek(0) - in_mem_epub = StringIO() + in_mem_epub = BytesIO() in_mem_epub.write(self.epub2file.read()) in_mem_epub.seek(0) boox2 = inst.platform(epubfile=in_mem_epub, **params) diff --git a/celery_module.py b/celery_module.py new file mode 100644 index 000000000..20160ae1c --- 
/dev/null +++ b/celery_module.py @@ -0,0 +1,20 @@ +from __future__ import absolute_import, unicode_literals + +import os + +from celery import Celery + +# set the default Django settings module for the 'celery' program. +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'regluit.settings') + +app = Celery('regluit') + +# Using a string here means the worker doesn't have to serialize +# the configuration object to child processes. + +app.config_from_object('django.conf:settings') + +# Load task modules from all registered Django app configs. +app.autodiscover_tasks() + + diff --git a/context_processors.py b/context_processors.py index 02db6a844..3c172a98d 100644 --- a/context_processors.py +++ b/context_processors.py @@ -1,12 +1,20 @@ def is_preview(request): - from django.conf import settings - return {'jquery_home': settings.JQUERY_HOME, 'jquery_ui_home': settings.JQUERY_UI_HOME, 'is_preview': settings.IS_PREVIEW, 'show_google_analytics': settings.SHOW_GOOGLE_ANALYTICS} - + from django.conf import settings + return { + 'jquery_home': settings.JQUERY_HOME, + 'jquery_ui_home': settings.JQUERY_UI_HOME, + 'jquery_ui_theme': settings.JQUERY_UI_THEME, + 'is_preview': settings.IS_PREVIEW, + 'show_google_analytics': settings.SHOW_GOOGLE_ANALYTICS + } + def count_unseen(request): - - if request.user.is_anonymous(): - count = 0 - else: - from notification.models import Notice - count = Notice.objects.unseen_count_for(request.user) - return {'unseen_count': count} \ No newline at end of file + try: + if request.user.is_anonymous: + count = 0 + else: + from notification.models import Notice + count = Notice.objects.unseen_count_for(request.user) + except AttributeError: + count = 0 + return {'unseen_count': count} \ No newline at end of file diff --git a/core/admin.py b/core/admin.py index 73706deb9..e365fc706 100644 --- a/core/admin.py +++ b/core/admin.py @@ -3,7 +3,8 @@ # from django import forms from django.contrib.admin import ModelAdmin, register -from 
django.core.urlresolvers import reverse +from django.urls import reverse +from django.utils.safestring import mark_safe from selectable.forms import ( AutoCompleteSelectWidget, @@ -196,7 +197,7 @@ class Meta(object): class EbookAdmin(ModelAdmin): form = EbookAdminForm search_fields = ('edition__title', '^url') # search by provider using leading url - list_display = ('__unicode__', 'created', 'user', 'edition') + list_display = ('__str__', 'created', 'user', 'edition') date_hierarchy = 'created' ordering = ('edition__title',) readonly_fields = ('user', 'filesize', 'download_count') @@ -226,20 +227,17 @@ class EbookFileAdmin(ModelAdmin): date_hierarchy = 'created' ordering = ('edition__work',) fields = ('file', 'format', 'edition', 'edition_link', 'ebook', 'ebook_link', 'source') - readonly_fields = ('file', 'edition_link', 'ebook_link',) + readonly_fields = ('file', 'edition_link', 'ebook_link', 'source') def edition_link(self, obj): if obj.edition: link = reverse("admin:core_edition_change", args=[obj.edition_id]) - return u'%s' % (link, obj.edition) - return u'' + return mark_safe('%s' % (link, obj.edition)) + return '' def ebook_link(self, obj): if obj.ebook: link = reverse("admin:core_ebook_change", args=[obj.ebook_id]) - return u'%s' % (link, obj.ebook) - return u'' - edition_link.allow_tags = True - ebook_link.allow_tags = True - + return mark_safe('%s' % (link, obj.ebook)) + return '' @register(models.Wishlist) class WishlistAdmin(ModelAdmin): @@ -257,8 +255,7 @@ class GiftAdmin(ModelAdmin): search_fields = ('giver__username', 'to') readonly_fields = ('giver', 'acq',) def acq_admin_link(self, gift): - return "%s" % (gift.acq_id, gift.acq) - acq_admin_link.allow_tags = True + return mark_safe("%s" % (gift.acq_id, gift.acq)) @register(models.CeleryTask) class CeleryTaskAdmin(ModelAdmin): diff --git a/core/bookloader.py b/core/bookloader.py index a9fed0c2f..2baabe052 100755 --- a/core/bookloader.py +++ b/core/bookloader.py @@ -6,7 +6,7 @@ import re from datetime 
import timedelta from xml.etree import ElementTree -from urlparse import (urljoin, urlparse) +from urllib.parse import (urljoin, urlparse) import requests @@ -14,29 +14,31 @@ # django imports from django.conf import settings -from django.core.files.base import ContentFile from django.core.files.storage import default_storage from django.db import IntegrityError +from django.db.models import Sum from django.forms import ValidationError +from django.utils.timezone import now from django_comments.models import Comment +import github3 from github3 import (login, GitHub) from github3.repos.release import Release -from django.utils.timezone import now from gitenberg.metadata.pandata import Pandata # regluit imports import regluit import regluit.core.isbn -from regluit.core.validation import test_file from regluit.marc.models import inverse_marc_rels +from regluit.utils.lang import lang_to_language_code from . import cc from . import models -from .parameters import WORK_IDENTIFIERS +from .parameters import WORK_IDENTIFIERS, DOWNLOADABLE from .validation import identifier_cleaner, unreverse_name +from .models import loader logger = logging.getLogger(__name__) request_log = logging.getLogger("requests") @@ -58,10 +60,10 @@ def add_by_oclc_from_google(oclc): url = "https://www.googleapis.com/books/v1/volumes" try: results = _get_json(url, {"q": '"OCLC%s"' % oclc}) - except LookupFailure, e: + except LookupFailure as e: logger.exception(u"lookup failure for %s", oclc) return None - if not results.has_key('items') or not results['items']: + if not 'items' in results or not results['items']: logger.warn(u"no google hits for %s", oclc) return None @@ -69,9 +71,9 @@ def add_by_oclc_from_google(oclc): e = add_by_googlebooks_id(results['items'][0]['id'], results=results['items'][0]) models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save() return e - except LookupFailure, e: + except LookupFailure as e: logger.exception(u"failed to add edition for %s", oclc) - 
except IntegrityError, e: + except IntegrityError as e: logger.exception(u"google books data for %s didn't fit our db", oclc) return None @@ -135,7 +137,7 @@ def get_google_isbn_results(isbn): except LookupFailure: logger.exception(u"lookup failure for %s", isbn) return None - if not results.has_key('items') or not results['items']: + if not 'items' in results or not results['items']: logger.warn(u"no google hits for %s", isbn) return None return results @@ -187,7 +189,7 @@ def update_edition(edition): item = results['items'][0] googlebooks_id = item['id'] d = item['volumeInfo'] - if d.has_key('title'): + if 'title' in d: title = d['title'] else: title = '' @@ -197,7 +199,7 @@ def update_edition(edition): title = edition.work.title # check for language change - language = d['language'] + language = fix_lang(d['language']) # allow variants in main language (e.g., 'zh-tw') if len(language) > 5: language = language[0:5] @@ -206,7 +208,7 @@ def update_edition(edition): # attach edition to the if edition.work.language != language: logger.info(u"reconnecting %s since it is %s instead of %s", - googlebooks_id, language, edition.work.language) + googlebooks_id, language, edition.work.language) old_work = edition.work new_work = models.Work(title=title, language=language) @@ -217,7 +219,7 @@ def update_edition(edition): logger.info(u"moving identifier %s", identifier.value) identifier.work = new_work identifier.save() - if old_work and old_work.editions.count() == 0: + if old_work and not old_work.editions.exists(): #a dangling work; make sure nothing else is attached! merge_works(new_work, old_work) @@ -250,9 +252,10 @@ def get_isbn_item(items, isbn): for ident in industryIdentifiers: if ident['identifier'] == isbn: return item - else: - return None # no items - return item + # no isbn item maybe "other" + for item in items: + return item + def add_by_isbn_from_google(isbn, work=None): """add a book to the UnglueIt database from google based on ISBN. 
The work parameter @@ -285,9 +288,9 @@ def add_by_isbn_from_google(isbn, work=None): results=item, isbn=isbn ) - except LookupFailure, e: + except LookupFailure as e: logger.exception(u"failed to add edition for %s", isbn) - except IntegrityError, e: + except IntegrityError as e: logger.exception(u"google books data for %s didn't fit our db", isbn) return None return None @@ -306,6 +309,15 @@ def get_edition_by_id(type, value): except models.Identifier.DoesNotExist: return None +def fix_lang(language): + if len(language) > 5: + language = language[0:5] + if language == 'un': + # 5/28/21 language coding is broken in google books + # hope they fix it + language = 'xx' + return language + def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None): """add a book to the UnglueIt database based on the GoogleBooks ID. The @@ -344,12 +356,12 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None): item = _get_json(url) d = item['volumeInfo'] - if d.has_key('title'): + if 'title' in d: title = d['title'] else: title = '' if not title: - # need a title to make an edition record; some crap records in GB. + # need a title to make an edition record; some crap records in GB. 
# use title from parent if available if work: title = work.title @@ -357,10 +369,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None): return None # don't add the edition to a work with a different language - # https://www.pivotaltracker.com/story/show/17234433 - language = d['language'] - if len(language) > 5: - language = language[0:5] + language = fix_lang(d['language']) if work and work.language != language: logger.info(u"not connecting %s since it is %s instead of %s", googlebooks_id, language, work.language) @@ -488,13 +497,13 @@ def add_related(isbn): logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id) work = merge_works(work, related_edition.work) else: - if other_editions.has_key(related_language): + if related_language in other_editions: other_editions[related_language].append(related_edition) else: other_editions[related_language] = [related_edition] # group the other language editions together - for lang_group in other_editions.itervalues(): + for lang_group in other_editions.values(): logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group]) if len(lang_group) > 1: lang_edition = lang_group[0] @@ -518,7 +527,7 @@ def thingisbn(isbn): which come back as isbn_13') """ logger.info(u"looking up %s at ThingISBN", isbn) - url = "https://www.librarything.com/api/thingISBN/%s" % isbn + url = f"https://www.librarything.com/api/{settings.LIBRARYTHING_KEY}/thingISBN/{isbn}" xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content try: doc = ElementTree.fromstring(xml) @@ -540,7 +549,7 @@ def merge_works(w1, w2, user=None): #don't merge if the works are related. 
if w2 in w1.works_related_to.all() or w1 in w2.works_related_to.all(): return w1 - + # check if one of the works is a series with parts (that have their own isbn) if w1.works_related_from.filter(relation='part'): models.WorkRelation.objects.get_or_create(to_work=w2, from_work=w1, relation='part') @@ -548,14 +557,20 @@ def merge_works(w1, w2, user=None): if w2.works_related_from.filter(relation='part'): models.WorkRelation.objects.get_or_create(to_work=w1, from_work=w2, relation='part') return w1 - - + if w1.editions.count() > 3 and w2.editions.count() > 3 and not user: + # avoid big merges + return w1 + if w2.selected_edition is not None and w1.selected_edition is None: #the merge should be reversed temp = w1 w1 = w2 w2 = temp - models.WasWork(was=w2.pk, work=w1, user=user).save() + try: + models.WasWork(was=w2.pk, work=w1, user=user).save() + except IntegrityError: + # already a 'was' entry for w2; somehow it was never deleted + pass for ww in models.WasWork.objects.filter(work=w2): ww.work = w1 ww.save() @@ -600,9 +615,6 @@ def merge_works(w1, w2, user=None): for hold in w2.holds.all(): hold.work = w1 hold.save() - for landing in w2.landings.all(): - landing.object_id = w1.id - landing.save() for subject in w2.subjects.all(): if subject not in w1.subjects.all(): w1.subjects.add(subject) @@ -615,21 +627,35 @@ def merge_works(w1, w2, user=None): w2.delete(cascade=False) return w1 -def detach_edition(e): +def detach_editions(eds): """ - will detach edition from its work, creating a new stub work. if remerge=true, will see if - there's another work to attach to + will detach edition from its work, creating a new stub work. 
""" + if not len(eds): + return + e = eds[0] + from_work = e.work logger.info(u"splitting edition %s from %s", e, e.work) w = models.Work(title=e.title, language=e.work.language) w.save() + for e in eds: + for identifier in e.identifiers.all(): + identifier.work = w + identifier.save() - for identifier in e.identifiers.all(): - identifier.work = w - identifier.save() + e.work = w + e.save() + + models.WorkRelation.objects.get_or_create( + to_work=w, + from_work=from_work, + relation='unspecified', + ) + + frees = models.Work.objects.annotate(free=Sum('editions__ebooks__active')).filter(free__gt=0) + w.is_free = frees.exists() + w.save() - e.work = w - e.save() SPAM_STRINGS = ["GeneralBooksClub.com", "AkashaPublishing.Com"] def despam_description(description): @@ -644,12 +670,14 @@ def despam_description(description): pieces = description.split("a million books for free.") if len(pieces) > 1: return pieces[1] - return description + return description.replace('\r\n', '\n') def add_openlibrary(work, hard_refresh=False): + if not settings.USE_OPENLIBRARY and not settings.DEBUG: + return if (not hard_refresh) and work.openlibrary_lookup is not None: # don't hit OL if we've visited in the past month or so - if now()- work.openlibrary_lookup < timedelta(days=30): + if now()- work.openlibrary_lookup < timedelta(days=90): return work.openlibrary_lookup = now() work.save() @@ -664,7 +692,7 @@ def add_openlibrary(work, hard_refresh=False): url = "https://openlibrary.org/api/books" params = {"format": "json", "jscmd": "details"} subjects = [] - for edition in work.editions.all(): + for edition in work.editions.all()[:10]: isbn_key = "ISBN:%s" % edition.isbn_13 params['bibkeys'] = isbn_key try: @@ -672,9 +700,9 @@ def add_openlibrary(work, hard_refresh=False): except LookupFailure: logger.exception(u"OL lookup failed for %s", isbn_key) e = {} - if e.has_key(isbn_key): - if e[isbn_key].has_key('details'): - if e[isbn_key]['details'].has_key('oclc_numbers'): + if isbn_key in e: + 
if 'details' in e[isbn_key]: + if 'oclc_numbers' in e[isbn_key]['details']: for oclcnum in e[isbn_key]['details']['oclc_numbers']: models.Identifier.get_or_add( type='oclc', @@ -682,42 +710,36 @@ def add_openlibrary(work, hard_refresh=False): work=work, edition=edition ) - if e[isbn_key]['details'].has_key('identifiers'): + if 'identifiers' in e[isbn_key]['details']: ids = e[isbn_key]['details']['identifiers'] - if ids.has_key('goodreads'): - models.Identifier.get_or_add( - type='gdrd', - value=ids['goodreads'][0], - work=work, edition=edition - ) - if ids.has_key('librarything'): + if 'librarything' in ids: models.Identifier.get_or_add( type='ltwk', value=ids['librarything'][0], work=work ) - if ids.has_key('google'): + if 'google' in ids: models.Identifier.get_or_add( type='goog', value=ids['google'][0], work=work ) - if ids.has_key('project_gutenberg'): + if 'project_gutenberg' in ids: models.Identifier.get_or_add( type='gute', value=ids['project_gutenberg'][0], work=work ) - if e[isbn_key]['details'].has_key('works'): + if 'works' in e[isbn_key]['details']: work_key = e[isbn_key]['details']['works'].pop(0)['key'] logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key) models.Identifier.get_or_add(type='olwk', value=work_key, work=work) try: w = _get_json("https://openlibrary.org" + work_key, type='ol') - if w.has_key('description'): + if 'description' in w: description = w['description'] if isinstance(description, dict): - if description.has_key('value'): + if 'value' in description: description = description['value'] description = despam_description(description) if not work.description or \ @@ -725,7 +747,7 @@ def add_openlibrary(work, hard_refresh=False): len(description) > len(work.description): work.description = description work.save() - if w.has_key('subjects') and len(w['subjects']) > len(subjects): + if 'subjects' in w and len(w['subjects']) > len(subjects): subjects = w['subjects'] except LookupFailure: logger.exception(u"OL lookup 
failed for %s", work_key) @@ -749,7 +771,10 @@ def _get_json(url, params={}, type='gb'): if type == 'gb': params['key'] = settings.GOOGLE_BOOKS_API_KEY params['country'] = 'us' - response = requests.get(url, params=params, headers=headers) + try: + response = requests.get(url, params=params, headers=headers) + except requests.exceptions.ConnectionError: + raise LookupFailure("GET failed: url=%s and params=%s" % (url, params)) if response.status_code == 200: return json.loads(response.content) else: @@ -840,7 +865,6 @@ def load_from_yaml(yaml_url, test_mode=False): return edition.work_id if edition else None def edition_for_ident(id_type, id_value): - #print 'returning edition for {}: {}'.format(id_type, id_value) for ident in models.Identifier.objects.filter(type=id_type, value=id_value): return ident.edition if ident.edition else ident.work.editions[0] @@ -863,22 +887,6 @@ def edition_for_etype(etype, metadata, default=None): for key in metadata.edition_identifiers.keys(): return edition_for_ident(key, metadata.identifiers[key]) -def load_ebookfile(url, etype): - ''' - return a ContentFile if a new ebook has been loaded - ''' - ebfs = models.EbookFile.objects.filter(source=url) - if ebfs: - return None - try: - r = requests.get(url) - contentfile = ContentFile(r.content) - test_file(contentfile, etype) - return contentfile - except IOError, e: - logger.error(u'could not open {}'.format(url)) - except ValidationError, e: - logger.error(u'downloaded {} was not a valid {}'.format(url, etype)) class BasePandataLoader(object): def __init__(self, url): @@ -909,10 +917,9 @@ def load_from_pandata(self, metadata, work=None): if work and id.work and id.work_id is not work.id: # dangerous! 
merge newer into older if work.id < id.work_id: - merge_works(work, id.work) + work = merge_works(work, id.work) else: - merge_works(id.work, work) - work = id.work + work = merge_works(id.work, work) else: work = id.work if id.edition and not edition: @@ -923,7 +930,11 @@ def load_from_pandata(self, metadata, work=None): new_ids.append((identifier, id_code, value)) if not work: - work = models.Work.objects.create(title=metadata.title, language=metadata.language) + if metadata.title: + language = lang_to_language_code(metadata.language) + work = models.Work.objects.create(title=metadata.title, language=language if language else 'xx') + else: + return None if not edition: if metadata.edition_note: (note, created) = models.EditionNote.objects.get_or_create(note=metadata.edition_note) @@ -949,11 +960,13 @@ def load_from_pandata(self, metadata, work=None): #be careful about overwriting the work description if metadata.description and len(metadata.description) > len(work.description): + if isinstance(metadata.description, list): + metadata.description = '\n'.join(metadata.description) # don't over-write reasonably long descriptions if len(work.description) < 500: - work.description = metadata.description + work.description = metadata.description.replace('\r\n', '\n') - if metadata.creator and not edition.authors.count(): + if metadata.creator and not edition.authors.exists(): edition.authors.clear() for key in metadata.creator.keys(): creators = metadata.creator[key] @@ -966,7 +979,7 @@ def load_from_pandata(self, metadata, work=None): for yaml_subject in metadata.subjects: #always add yaml subjects (don't clear) if isinstance(yaml_subject, tuple): (authority, heading) = yaml_subject - elif isinstance(yaml_subject, str) or isinstance(yaml_subject, unicode) : + elif isinstance(yaml_subject, str) or isinstance(yaml_subject, str): (authority, heading) = ('', yaml_subject) else: continue @@ -986,16 +999,16 @@ def load_from_pandata(self, metadata, work=None): def 
load_ebooks(self, metadata, edition, test_mode=False, user=None): default_edition = edition - for key in ['epub', 'pdf', 'mobi']: + license = cc.license_from_cc_url(metadata.rights_url) + for key in DOWNLOADABLE: url = metadata.metadata.get('download_url_{}'.format(key), None) if url: edition = edition_for_etype(key, metadata, default=default_edition) if edition: - contentfile = load_ebookfile(url, key) - if contentfile: + contentfile, fmt = loader.load_ebookfile(url, key) + if contentfile and fmt == key: contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key) path = default_storage.save(contentfile_name, contentfile) - license = cc.license_from_cc_url(metadata.rights_url) ebf = models.EbookFile.objects.create( format=key, edition=edition, @@ -1064,26 +1077,10 @@ def git_download_from_yaml_url(yaml_url, version, edition_name='book', format_=' ''' if yaml_url.endswith('raw/master/metadata.yaml'): repo_url = yaml_url[0:-24] - #print (repo_url,version,edition_name) ebook_url = repo_url + 'releases/download/' + version + '/' + edition_name + '.' + format_ return ebook_url -def release_from_tag(repo, tag_name): - """Get a release by tag name. 
- release_from_tag() returns a release with specified tag - while release() returns a release with specified release id - :param str tag_name: (required) name of tag - :returns: :class:`Release ` - """ - # release_from_tag adapted from - # https://github.com/sigmavirus24/github3.py/blob/38de787e465bffc63da73d23dc51f50d86dc903d/github3/repos/repo.py#L1781-L1793 - - url = repo._build_url('releases', 'tags', tag_name, - base_url=repo._api) - json_obj = repo._json(repo._get(url), 200) - return Release(json_obj, repo) if json_obj else None - def ebooks_in_github_release(repo_owner, repo_name, tag, token=None): """ returns a list of (book_type, book_name) for a given GitHub release (specified by @@ -1101,24 +1098,29 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None): gh = GitHub() repo = gh.repository(repo_owner, repo_name) - release = release_from_tag(repo, tag) + try: + release = repo.release_from_tag(tag) + return [(EBOOK_FORMATS.get(asset.content_type), asset.name) + for asset in release.assets() + if EBOOK_FORMATS.get(asset.content_type) is not None] + except github3.exceptions.NotFoundError: + logger.error('No releases available for %s/%s', repo_owner, repo_name) + return [] - return [(EBOOK_FORMATS.get(asset.content_type), asset.name) - for asset in release.iter_assets() - if EBOOK_FORMATS.get(asset.content_type) is not None] def add_from_bookdatas(bookdatas): ''' bookdatas are iterators of scrapers ''' editions = [] for bookdata in bookdatas: edition = work = None - loader = BasePandataLoader(bookdata.base) - pandata = Pandata() - pandata.metadata = bookdata.metadata - for metadata in pandata.get_edition_list(): - edition = loader.load_from_pandata(metadata, work) - work = edition.work - loader.load_ebooks(pandata, edition) - if edition: - editions.append(edition) + if bookdata and bookdata.metadata: + loader = BasePandataLoader(bookdata.base) + pandata = Pandata() + pandata.metadata = bookdata.metadata + for metadata in 
pandata.get_edition_list(): + edition = loader.load_from_pandata(metadata, work) + work = edition.work + loader.load_ebooks(pandata, edition) + if edition: + editions.append(edition) return editions diff --git a/core/cc.py b/core/cc.py index 333096e29..1429b3e7a 100644 --- a/core/cc.py +++ b/core/cc.py @@ -8,12 +8,12 @@ import re INFO_CC = ( - ('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'), + ('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'), ('CC BY-NC-SA', 'by-nc-sa', 'Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)', 'https://creativecommons.org/licenses/by-nc-sa/3.0/', 'Creative Commons Attribution-NonCommercial-ShareAlike'), ('CC BY-NC', 'by-nc', 'Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0)', 'https://creativecommons.org/licenses/by-nc/3.0/', 'Creative Commons Attribution-NonCommercial'), - ('CC BY-ND', 'by-nd', 'Creative Commons Attribution-NoDerivs 3.0 Unported (CC BY-ND 3.0)', 'https://creativecommons.org/licenses/by-nd/3.0/','Creative Commons Attribution-NoDerivs'), + ('CC BY-ND', 'by-nd', 'Creative Commons Attribution-NoDerivs 3.0 Unported (CC BY-ND 3.0)', 'https://creativecommons.org/licenses/by-nd/3.0/', 'Creative Commons Attribution-NoDerivs'), ('CC BY-SA', 'by-sa', 'Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)', 'https://creativecommons.org/licenses/by-sa/3.0/', 'Creative Commons Attribution-ShareAlike'), - ('CC BY', 'by', 'Creative Commons Attribution 3.0 Unported (CC BY 3.0)', 'https://creativecommons.org/licenses/by/3.0/', 'Creative Commons Attribution'), + ('CC BY', 'by', 'Creative Commons Attribution 3.0 Unported (CC 
BY 3.0)', 'https://creativecommons.org/licenses/by/3.0/', 'Creative Commons Attribution'), ('CC0', 'cc0', 'No Rights Reserved (CC0)', 'https://creativecommons.org/about/cc0', 'No Rights Reserved (CC0)'), ) INFO_FREE = INFO_CC + ( @@ -28,20 +28,20 @@ # CCHOICES, CCGRANTS, and FORMATS are all used in places that expect tuples # CONTENT_TYPES will be easiest to manipulate in ungluify_record as a dict -CCCHOICES = tuple([(item[0],item[2]) for item in INFO_CC]) -FREECHOICES = tuple([(item[0],item[2]) for item in INFO_FREE]) - -CHOICES = tuple([(item[0],item[4]) for item in INFO_ALL]) +CCCHOICES = tuple([(item[0], item[2]) for item in INFO_CC]) +FREECHOICES = tuple([(item[0], item[2]) for item in INFO_FREE]) -CCGRANTS = tuple([(item[0],item[3]) for item in INFO_CC]) +CHOICES = tuple([(item[0], item[4]) for item in INFO_ALL]) -GRANTS = tuple([(item[0],item[3]) for item in INFO_ALL]) +CCGRANTS = tuple([(item[0], item[3]) for item in INFO_CC]) -LICENSE_LIST = [item[0] for item in INFO_CC] -LICENSE_LIST_ALL = [item[0] for item in INFO_ALL] +GRANTS = tuple([(item[0], item[3]) for item in INFO_ALL]) + +LICENSE_LIST = [item[0] for item in INFO_CC] +LICENSE_LIST_ALL = [item[0] for item in INFO_ALL] LICENSE_NAMES_ALL = [item[2] for item in INFO_ALL] LICENSE_URLS_ALL = [item[3] for item in INFO_ALL] -FACET_LIST = [item[1] for item in INFO_ALL] +FACET_LIST = [item[1] for item in INFO_ALL] RIGHTS_ALIAS = { "Public domain in the USA.":"PD-US", @@ -53,8 +53,7 @@ def url(license): license = RIGHTS_ALIAS.get(license, license) if license in LICENSE_LIST_ALL: return INFO_ALL[LICENSE_LIST_ALL.index(license)][3] - else: - return '' + return '' @staticmethod def badge(license): @@ -80,40 +79,38 @@ def badge(license): return '/static/images/lal.png' elif license == 'OSI': return '/static/images/opensource.png' - else: - return '' + return '' def description(license): - if license == 'PD-US': - return 'Use of this material is not restricted by copyright in the US.' 
- elif license == 'CC0': - return 'The copyright owner has dedicated the material to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law. You can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission.' - elif license == 'CC BY': - return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; for any purpose, even commercially. Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.' - elif license == 'CC BY-NC-ND': - return 'You are free to: copy and redistribute the material in any medium or format; under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.; you may not use the material for commercial purposes; if you remix, transform, or build upon the material, you may not distribute the modified material.' - elif license == 'CC BY-NC-SA': - return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. You may not use the material for commercial purposes. If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.' 
- elif license == 'CC BY-NC': - return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. You may not use the material for commercial purposes.' - elif license == 'CC BY-SA': - return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; for any purpose, even commercially. Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.' - elif license == 'CC BY-ND': - return 'You are free to: copy and redistribute the material in any medium or format; for any purpose, even commercially. Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. If you remix, transform, or build upon the material, you may not distribute the modified material.' - elif license == 'GFDL': - return 'The purpose of this License is to make a manual, textbook, or other functional and useful document "free" in the sense of freedom: to assure everyone the effective freedom to copy and redistribute it, with or without modifying it, either commercially or noncommercially. 
Secondarily, this License preserves for the author and publisher a way to get credit for their work, while not being considered responsible for modifications made by others.' - elif license == 'LAL': - return 'Avec la Licence Art Libre, l\'autorisation est donnée de copier, de diffuser et de transformer librement les œuvres dans le respect des droits de l\'auteur.' - elif license == 'OSI': - return 'Open source licenses are licenses that comply with the Open Source Definition — in brief, they allow software to be freely used, modified, and shared. To be approved by the Open Source Initiative (also known as the OSI), a license must go through the Open Source Initiative\'s license review process.' - else: - return '' + if license == 'PD-US': + return 'Use of this material is not restricted by copyright in the US.' + elif license == 'CC0': + return 'The copyright owner has dedicated the material to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law. You can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission.' + elif license == 'CC BY': + return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; for any purpose, even commercially. Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.' + elif license == 'CC BY-NC-ND': + return 'You are free to: copy and redistribute the material in any medium or format; under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. 
You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.; you may not use the material for commercial purposes; if you remix, transform, or build upon the material, you may not distribute the modified material.' + elif license == 'CC BY-NC-SA': + return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. You may not use the material for commercial purposes. If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.' + elif license == 'CC BY-NC': + return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. You may not use the material for commercial purposes.' + elif license == 'CC BY-SA': + return 'You are free to: copy and redistribute the material in any medium or format; remix, transform, and build upon the material; for any purpose, even commercially. Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.' 
+ elif license == 'CC BY-ND': + return 'You are free to: copy and redistribute the material in any medium or format; for any purpose, even commercially. Under the following terms: You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. If you remix, transform, or build upon the material, you may not distribute the modified material.' + elif license == 'GFDL': + return 'The purpose of this License is to make a manual, textbook, or other functional and useful document "free" in the sense of freedom: to assure everyone the effective freedom to copy and redistribute it, with or without modifying it, either commercially or noncommercially. Secondarily, this License preserves for the author and publisher a way to get credit for their work, while not being considered responsible for modifications made by others.' + elif license == 'LAL': + return 'Avec la Licence Art Libre, l\'autorisation est donnée de copier, de diffuser et de transformer librement les œuvres dans le respect des droits de l\'auteur.' + elif license == 'OSI': + return 'Open source licenses are licenses that comply with the Open Source Definition — in brief, they allow software to be freely used, modified, and shared. To be approved by the Open Source Initiative (also known as the OSI), a license must go through the Open Source Initiative\'s license review process.' 
+ return '' class ccinfo(object): def __init__(self, license): - value=license_value(license) - self.license=value if value else license - + value = license_value(license) + self.license = value if value else license + @property def description(self): return description(self.license) @@ -127,29 +124,28 @@ def url(self): def full_title(self): if self.license in LICENSE_LIST_ALL: return INFO_ALL[LICENSE_LIST_ALL.index(self.license)][2] - else: - return self.license + return self.license @property def title(self): if self.license in LICENSE_LIST_ALL: return INFO_ALL[LICENSE_LIST_ALL.index(self.license)][4] - else: - return self.license + return self.license @property def is_cc(self): return self.license in LICENSE_LIST @property def is_pd(self): return self.license == 'PD-US' - + def __str__(self): return self.license def license_value(facet): if facet in FACET_LIST: return LICENSE_LIST_ALL[FACET_LIST.index(facet)] - else: - return '' + return '' + +MATCH_CC_LICENSE = re.compile(r' BY(-NC)?(-ND|-SA)? 
') def match_license(license_string): if license_string in LICENSE_LIST_ALL: @@ -164,6 +160,9 @@ def match_license(license_string): return INFO_ALL[l][0] except ValueError: pass + lic = MATCH_CC_LICENSE.search(license_string) + if lic: + return 'CC{}'.format(lic.group(0).upper()).strip() return RIGHTS_ALIAS.get(license_string, None) MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/') @@ -172,8 +171,9 @@ def license_from_cc_url(rights_url): return None lic = MATCH_LICENSE.search(rights_url) if lic: - return 'CC {}'.format(lic.group(1).upper()) + license_string = 'CC {}'.format(lic.group(1).upper()) + if license_string in LICENSE_LIST_ALL: + return license_string if rights_url.find('openedition.org') >= 0: return 'OPENEDITION' return '' - diff --git a/core/covers.py b/core/covers.py new file mode 100644 index 000000000..143f31f12 --- /dev/null +++ b/core/covers.py @@ -0,0 +1,128 @@ +""" handle caching and thumbnailing of covers """ + +import logging + +from django.utils.functional import LazyObject + +import sorl + +from sorl.thumbnail import get_thumbnail as sorl_get_thumbnail + +from sorl.thumbnail.base import ThumbnailBackend +from sorl.thumbnail.conf import settings, defaults as default_settings +from sorl.thumbnail.helpers import get_module_class +from sorl.thumbnail.images import BaseImageFile, ImageFile +from sorl.thumbnail import default + +from celery.utils.log import get_logger +celerylogger = get_logger(__name__) + +import regluit + +logger = logging.getLogger(__name__) + +DEFAULT_COVER_LARGE = '/static/images/generic_cover_full.png' +DEFAULT_COVER = '/static/images/generic_cover_larger.png' +DEFAULT_COVER_SMALL = '/static/images/generic_cover_thumb.png' + +_storage = None + +class Storage(LazyObject): + ''' + Monkey patch to fix S3 backend slowness in sorl.thumbnail + https://github.com/jazzband/sorl-thumbnail/issues/301 + ''' + def _setup(self): + global _storage + if not _storage: + _storage = 
get_module_class(settings.THUMBNAIL_STORAGE)() + + self._wrapped = _storage + +sorl.thumbnail.default.storage = Storage() + + +class DefaultImageFile(BaseImageFile): + is_default = True + + def __init__(self, geometry_string='x550'): + if geometry_string == '128': + self._url = DEFAULT_COVER + self.size = (131, 192) + elif geometry_string == 'x80': + self._url = DEFAULT_COVER_SMALL + self.size = (55, 80) + else: + self._url = DEFAULT_COVER_LARGE + self.size = (376, 550) + + @property + def url(self): + return self._url + + def exists(self): + return True + +class ReadOnlyThumbnailBackend(ThumbnailBackend): + """ + A backend that never makes a new thumbnail, but adds missing thumbnails to a task queue + """ + + def get_thumbnail(self, file_, geometry_string, **options): + """ + Returns thumbnail as an ImageFile instance for file with geometry and + options given. It will try to get it from the key value store, + otherwise return a Dummy. + """ + logger.debug('Getting thumbnail for file [%s] at [%s]', file_, geometry_string) + + if file_: + source = ImageFile(file_) + else: + raise ValueError('falsey file_ argument in get_thumbnail()') + + # preserve image filetype + if settings.THUMBNAIL_PRESERVE_FORMAT: + options.setdefault('format', self._get_format(source)) + + for key, value in self.default_options.items(): + options.setdefault(key, value) + + for key, attr in self.extra_options: + value = getattr(settings, attr) + if value != getattr(default_settings, attr): + options.setdefault(key, value) + + name = self._get_thumbnail_filename(source, geometry_string, options) + thumbnail = ImageFile(name, default.storage) + cached = default.kvstore.get(thumbnail) + + if cached: + setattr(cached, 'is_default', False) + return cached + + logger.info('tasking a new thumbnail for %s, %s', file_, geometry_string) + args = [file_, geometry_string] + try: + regluit.core.tasks.make_cover_thumbnail.apply_async( + args=args, kwargs=options, retry=False) + except 
regluit.core.tasks.make_cover_thumbnail.OperationalError as exc: + logger.error('failed new thumbnail for %s, %s', file_, geometry_string) + celerylogger.exception('Sending task raised: %r', exc) + return DefaultImageFile(geometry_string) + + +backend = ReadOnlyThumbnailBackend() +get_thumbnail = backend.get_thumbnail + +def make_cover_thumbnail(url, geometry_string, **options): + try: + im = sorl_get_thumbnail(url, geometry_string, **options) + except (IOError, OSError): + logger.error('couldnt make thumbnail for %s, probably UnidentifiedImageError', url) + return False + + if im.exists(): + return True + logger.error('couldnt make thumbnail for %s, got non-existent im', url) + return False diff --git a/core/epub.py b/core/epub.py index 27dc44ce2..7aa71a95c 100644 --- a/core/epub.py +++ b/core/epub.py @@ -1,27 +1,26 @@ """ Utilities that manipulate epub files """ - -from regluit.pyepub import EPUB, InvalidEpub -from StringIO import StringIO +from io import StringIO, BytesIO +from pyepub import EPUB, InvalidEpub from django.template.loader import render_to_string def personalize(epub_file, acq): output = EPUB(epub_file, "a") context={'acq':acq} - part = StringIO(unicode(render_to_string('epub/datedcc_license.xhtml', context))) + part = StringIO(str(render_to_string('epub/datedcc_license.xhtml', context))) output.addpart(part, "datedcc_license.xhtml", "application/xhtml+xml", 1) #after title, we hope output.addmetadata('rights','%s after %s'%(acq.work.last_campaign().license_url,acq.work.last_campaign().cc_date)) - personalized_epub= StringIO() + personalized_epub= BytesIO() output.writetodisk(personalized_epub) #logger.info("personalized") return personalized_epub def ask_epub(epub_file, context): output = EPUB(epub_file, "a") - part = StringIO(unicode(render_to_string('epub/ask.xhtml', context))) + part = StringIO(str(render_to_string('epub/ask.xhtml', context))) output.addpart(part, "ask.xhtml", "application/xhtml+xml", 1) #after title, we hope - asking_epub= 
StringIO() + asking_epub = BytesIO() output.writetodisk(asking_epub) return asking_epub @@ -29,7 +28,7 @@ def ask_epub(epub_file, context): def ungluify(epub_file, campaign): output = EPUB(epub_file, "a") context={'campaign':campaign} - part = StringIO(unicode(render_to_string('epub/cc_license.xhtml', context))) + part = StringIO(str(render_to_string('epub/cc_license.xhtml', context))) output.addpart(part, "cc_license.xhtml", "application/xhtml+xml", 1) #after title, we hope output.addmetadata('rights', campaign.license_url) output.close() diff --git a/core/facets.py b/core/facets.py index c282455c1..52e760ea7 100644 --- a/core/facets.py +++ b/core/facets.py @@ -2,6 +2,7 @@ from django.contrib.auth.models import User from django.db.models import Q from regluit.core import cc +from regluit.core.parameters import MAX_FACETS, ORDER_BY_KEYS class BaseFacet(object): facet_name = 'all' @@ -23,18 +24,18 @@ def _filter_model(self, model, query_set): else: return query_set - def __unicode__(self): + def __str__(self): if self.facet_name == 'all': return 'Free eBooks' - return unicode(self.facet_name) + return str(self.facet_name) @property def title(self): - return self.__unicode__() + return self.__str__() @property def label(self): - return self.__unicode__() + return self.__str__() def get_query_set(self): return self._get_query_set() @@ -66,10 +67,25 @@ def template(self): _stash_others = None def get_other_groups(self): + used = self.facets() + if len(used) >= MAX_FACETS: + # don't show more facets + return [] + + # Subjects (457K values) × other facets = hundreds of millions of + # crawlable URLs that bots exploit. See #1110. + # Rule: keyword facets cannot combine with any other facet type. 
+ has_keyword = any(f.facet_name.startswith('kw.') for f in used) + if has_keyword: + # keyword active → no further facets allowed + return [] + + has_non_base_facet = any(f.facet_name != 'all' for f in used) + if self._stash_others != None: return self._stash_others + others = [] - used = self.facets() for group in facet_groups: in_use = False for facet in used: @@ -77,13 +93,16 @@ def get_other_groups(self): in_use = True break if not in_use: + # If a non-keyword facet is already active, exclude keywords + if has_non_base_facet and isinstance(group, KeywordFacetGroup): + continue others.append(group) - self._stash_others=others + self._stash_others = others return others @property def description(self): - return self.__unicode__() + return self.__str__() class FacetGroup(object): # a FacetGroup should implement title, facets, has_facet(self, facet_name) and get_facet_class(self, facet_name) @@ -106,7 +125,7 @@ class FormatFacetGroup(FacetGroup): def __init__(self): super(FacetGroup,self).__init__() self.title = 'Format' - self.facets = ['pdf', 'epub', 'mobi'] + self.facets = ['pdf', 'epub'] self.label = '{} is ...'.format(self.title) def get_facet_class(self, facet_name): @@ -268,10 +287,10 @@ def set_name(self): self.term=self.facet_name[2:] def get_query_set(self): return self._get_query_set().filter( - Q(title__icontains=self.term) | - Q(editions__authors__name__icontains=self.term) | + Q(title__icontains=self.term) | + Q(editions__authors__name__icontains=self.term) | Q(subjects__name__iexact=self.term) - ) + ) def template(self): return 'facets/search.html' @@ -385,22 +404,27 @@ def get_all_facets(group='all'): facets = facets + facet_group.facets return facets +class InvalidFacetCombination(Exception): + """Raised when a keyword facet is combined with other facets (#1110).""" + pass + def get_facet_object(facet_path): facets = facet_path.replace('//','/').strip('/').split('/') + if len(facets) > 1: + # `all` is a compatibility alias for the base facet, not 
a real facet. + facets = [facet for facet in facets if facet and facet != 'all'] + # Block keyword + other facet compounds (#1110) + # 457K subjects × other facets = hundreds of millions of bot-crawlable URLs + real_facets = [f for f in facets if f] + if len(real_facets) > 1: + has_keyword = any(f.startswith('kw.') for f in real_facets) + if has_keyword: + raise InvalidFacetCombination(facet_path) facet_object = None - for facet in facets: + for facet in facets[:MAX_FACETS]: facet_object = get_facet(facet)(facet_object) - return facet_object + return facet_object if facet_object else BaseFacet(None) -order_by_keys = { - 'newest':['-featured', '-created'], - 'oldest':['created'], - 'featured':['-featured', '-num_wishes'], - 'popular':['-num_wishes'], - 'title':['title'], - 'none':[], #no ordering -} - def get_order_by(order_by_key): # return the args to use as arguments for order_by - return order_by_keys.get(order_by_key,'') \ No newline at end of file + return ORDER_BY_KEYS.get(order_by_key,'') diff --git a/core/fixtures/basic_campaign_test.json b/core/fixtures/basic_campaign_test.json index cdc498a22..f9ba28a59 100644 --- a/core/fixtures/basic_campaign_test.json +++ b/core/fixtures/basic_campaign_test.json @@ -48,7 +48,7 @@ "groups": [], "user_permissions": [], "password": "pbkdf2_sha256$10000$sILqnpDfTw8Z$djqLomeFeVJIEEqAbp+YqXVOVKI0onS6OwJvpiTEe2g=", - "email": "raymond.yee2@example.org", + "email": "openurl@example.org", "date_joined": "2012-10-14T10:03:43" } }, @@ -56,16 +56,9 @@ "pk": 1, "model": "core.userprofile", "fields": { - "goodreads_auth_secret": null, - "goodreads_user_name": null, "created": "2012-10-12T22:58:33", "tagline": "", - "twitter_id": "", - "goodreads_user_id": null, - "goodreads_auth_token": null, - "goodreads_user_link": null, "user": 1, - "facebook_id": null, "librarything_id": "", "home_url": "", "pic_url": "", @@ -76,16 +69,9 @@ "pk": 2, "model": "core.userprofile", "fields": { - "goodreads_auth_secret": null, - 
"goodreads_user_name": null, "created": "2012-10-14T09:57:15", "tagline": "", - "twitter_id": "", - "goodreads_user_id": null, - "goodreads_auth_token": null, - "goodreads_user_link": null, "user": 2, - "facebook_id": null, "librarything_id": "", "home_url": "", "pic_url": "", @@ -96,16 +82,9 @@ "pk": 3, "model": "core.userprofile", "fields": { - "goodreads_auth_secret": null, - "goodreads_user_name": null, "created": "2012-10-14T10:03:43", "tagline": "", - "twitter_id": "", - "goodreads_user_id": null, - "goodreads_auth_token": null, - "goodreads_user_link": null, "user": 3, - "facebook_id": null, "librarything_id": "", "home_url": "", "pic_url": "", @@ -549,7 +528,7 @@ "name": "Moby Dick", "edition": null, "amazon_receiver": "", - "deadline": "2023-04-12T23:59:00", + "deadline": "2033-04-12T23:59:00", "details": "

    \r\n\tThe book is already in the public domain, but let's do this again.

    \r\n", "left": "15000.00", "target": "15000.00" diff --git a/core/goodreads.py b/core/goodreads.py deleted file mode 100644 index 6b1e4c988..000000000 --- a/core/goodreads.py +++ /dev/null @@ -1,320 +0,0 @@ -""" -external library imports -""" -import httplib -import json -import logging -import oauth2 as oauth -import re - -from itertools import islice -from requests import request -from urllib import urlencode -from urlparse import urlparse, urlunparse, urljoin -from xml.etree import ElementTree as ET - -""" -django imports -""" -import django.utils.encoding - -""" -regluit imports -""" -import regluit.core -from regluit.core import bookloader, models - -# import parse_qsl from cgi if it doesn't exist in urlparse -try: - from urlparse import parse_qsl -except: - from cgi import parse_qsl - -from django.conf import settings - -logger = logging.getLogger(__name__) - -# QUESTION: should the request_token, access_token be part of the state of the client? -# for simplicity for now, I will make them part of the state of GoodReadsClient - -class GoodreadsException(Exception): - pass - -class GoodreadsAuthorizationRequired(GoodreadsException): - pass - -def filter_none(d): - d2 = {} - for (k,v) in d.iteritems(): - if v is not None: - d2[k] = v - return d2 - -def safe_strip(a_string): - try: - return a_string.strip() - except: - return '' - -class GoodreadsClient(object): - - url = 'https://www.goodreads.com' - request_token_url = urljoin(url,'oauth/request_token') - authorize_url = urljoin(url, '/oauth/authorize') - access_token_url = urljoin(url,'/oauth/access_token') - - def __init__(self,key,secret,user=None, access_token=None): - self.key = key - self.secret = secret - self.consumer = oauth.Consumer(key=self.key, - secret=self.secret) - - self.client = oauth.Client(self.consumer) - #self.unauth_client = None - - if access_token is not None: - self.__load_access_token(access_token) - else: - self.access_token = None - - if user is not None: - 
self.load_user_access_token(user) - - @property - def is_authorized(self): - return (self.access_token is not None) - - def begin_authorization (self, callback_url=None): - # get request token - response, content = self.client.request(GoodreadsClient.request_token_url, 'GET') - - if int(response['status']) != httplib.OK: - raise Exception('Invalid response: %s' % response['status']) - - request_token = dict(parse_qsl(content)) - - q = {'oauth_token':request_token['oauth_token']} - if callback_url is not None: - q['oauth_callback'] = callback_url - - authorize_link = GoodreadsClient.authorize_url + '?' + urlencode(q) - return (authorize_link, request_token) - - def complete_authorization(self, request_token): - token = oauth.Token(request_token['oauth_token'], - request_token['oauth_token_secret']) - - self.client = oauth.Client(self.consumer, token) - response, content = self.client.request(GoodreadsClient.access_token_url, 'POST') - if int(response['status']) != httplib.OK: - raise Exception('Invalid response: %s' % response['status']) - - access_token_raw = dict(parse_qsl(content)) - self.__load_access_token(access_token_raw) - return access_token_raw - - def load_user_access_token(self,user): - access_token = {'oauth_token':user.profile.goodreads_auth_token, - 'oauth_token_secret':user.profile.goodreads_auth_secret} - self.__load_access_token(access_token) - - def __load_access_token(self, access_token): - token = oauth.Token(access_token['oauth_token'], - access_token['oauth_token_secret']) - self.access_token = token - self.client = oauth.Client(self.consumer, self.access_token) - - def __clear_access_token(self): - self.access_token = None - self.consumer = oauth.Consumer(key=self.key, - secret=self.secret) - - def auth_user(self): - if self.is_authorized: - response, content = self.client.request('%s/api/auth_user' % GoodreadsClient.url, - 'GET') - if int(response['status']) != httplib.OK: - raise GoodreadsException('Error authenticating Goodreads user ' ) - 
else: - doc = ET.fromstring(content) - user = doc.find('user') - userid = user.get('id') - name = user.find('name').text - link = user.find('link').text - return({'userid':userid, 'name':name, 'link':link}) - else: - raise GoodreadsAuthorizationRequired('Attempt to access auth_user without authorization.') - - def add_book(self, book_id=871441, shelf_name='to-read'): - # the book is: "Moby-Dick: A Pop-Up Book" 871441 - body = urlencode({'name': 'to-read', 'book_id': book_id}) - headers = {'content-type': 'application/x-www-form-urlencoded'} - response, content = self.client.request('%s/shelf/add_to_shelf.xml' % GoodreadsClient.url, - 'POST', body, headers) - # check that the new resource has been created - if int(response['status']) != httplib.CREATED: - raise GoodreadsException('Cannot create resource: %s' % response['status']) - logger.info('response,content: %s | %s ' % (response,content)) - else: - return True - - def review_list_unauth(self, user_id, shelf='all',page=1,sort=None,per_page=20,order='a',search=None,v=2): - path="/review/list.xml" - method = "GET" - params = filter_none({'id':user_id,'shelf':shelf,'page':page,'sort':sort,'per_page':per_page,'order':order, - 'search':search, 'v':2}) - params["key"] = self.key - - request_url = urljoin(GoodreadsClient.url, path) - logger.info("request_url:{0}, params: {1}".format(request_url, params)) - - more_pages = True - - while (more_pages): - - r = request(method,request_url,params=params) - # print request_url, params - if r.status_code != httplib.OK: - raise GoodreadsException('Error in review_list_unauth, http status_code: {0}'.format(r.status_code)) - else: - doc = ET.fromstring(r.content) - # for the moment convert to a iterable of book data presented as dict -- one the way to paging through all results - reviews = doc.findall('reviews/review') - for review in reviews: - yield ({'id':review.find('id').text, - 'book': {'id': safe_strip(review.find('book/id').text), - 'isbn10': 
review.find('book/isbn').text, - 'isbn13': review.find('book/isbn13').text, - 'title': safe_strip(review.find('book/title').text), - 'text_reviews_count': safe_strip(review.find('book/text_reviews_count').text), - 'link': safe_strip(review.find('book/link').text), - 'small_image_url': safe_strip(review.find('book/small_image_url').text), - 'ratings_count': safe_strip(review.find('book/ratings_count').text), - 'description': safe_strip(review.find('book/description').text)} - }) - if len(reviews) == 0: - more_pages = False - else: - params["page"] += 1 - - - def review_list(self, user_id, shelf='all',page=1,sort=None,per_page=20,order='a',search=None,v=2): - """have to account for situation in which we might need authorized access - for now: assume no need for auth - sort: available_for_swap, position, num_pages, votes, recommender, rating, shelves, format, - avg_rating, date_pub, isbn, comments, author, title, notes, cover, isbn13, review, date_pub_edition, - condition, asin, date_started, owned, random, date_read, year_pub, read_count, date_added, - date_purchased, num_ratings, purchase_location, date_updated (optional) - """ - - path="/review/list.xml" - method = "GET" - params = filter_none({'id':user_id,'shelf':shelf,'page':page,'sort':sort,'per_page':per_page,'order':order, - 'search':search, 'v':2}) - - request_url = urljoin(GoodreadsClient.url, path) - - more_pages = True - - while (more_pages): - - response, content = self.client.request('%s?%s' % (request_url, urlencode(params)), - method) - if int(response['status']) != httplib.OK: - raise GoodreadsException('Error in review_list: ' ) - else: - #logger.info(' %s' % (content)) - doc = ET.fromstring(content) - # for the moment convert to a iterable of book data presented as dict -- one the way to paging through all results - reviews = doc.findall('reviews/review') - for review in reviews: - yield ({'id':review.find('id').text, - 'book': {'id': safe_strip(review.find('book/id').text), - 
'isbn10':review.find('book/isbn').text, - 'isbn13':review.find('book/isbn13').text, - 'title':safe_strip(review.find('book/title').text), - 'text_reviews_count':safe_strip(review.find('book/text_reviews_count').text), - 'link':safe_strip(review.find('book/link').text), - 'small_image_url':safe_strip(review.find('book/small_image_url').text), - 'ratings_count':safe_strip(review.find('book/ratings_count').text), - 'description':safe_strip(review.find('book/description').text)} - }) - if len(reviews) == 0: - more_pages = False - else: - params["page"] += 1 - - def shelves_list(self,user_id,page=1): - """BUG to fix: should go through all the pages, not just page 1 - """ - path = "/shelf/list.xml" - params = {'user_id':user_id, 'page':page} - params["key"] = self.key - method = "GET" - request_url = urljoin(GoodreadsClient.url, path) - - r = request(method,request_url,params=params) - - if r.status_code != httplib.OK: - raise GoodreadsException('Error in shelves_list: %s ' % (r.headers)) - else: - logger.info('headers: %s' % (r.headers)) - doc = ET.fromstring(r.content) - shelves = doc.find('shelves') - # do a simple parsing to a dictionary - - d = dict( [ (k,int(shelves.attrib[k])) for k in shelves.attrib ] ) - d["user_shelves"] = [{'name':shelf.find('name').text, - 'book_count':int(shelf.find('book_count').text), - 'description':shelf.find('description').text if shelf.find('description').attrib['nil'] != 'true' else None, - 'exclusive_flag':shelf.find('exclusive_flag').text} \ - for shelf in shelves.findall('user_shelf')] - - d["total_book_count"] = sum([shelf['book_count'] if shelf['exclusive_flag'] == 'true' else 0 for shelf in d["user_shelves"]]) - return d - - -def load_goodreads_shelf_into_wishlist(user, shelf_name='all', goodreads_user_id=None, max_books=None, expected_number_of_books=None): - """ - Load a specified Goodreads shelf (by default: all the books from the Goodreads account associated with user) - """ - - logger.info('Entering 
load_goodreads_shelf_into_wishlist. user: %s, shelf_name: %s, goodreads_user_id: %s, max_books: %s, expected_number_of_books: %s', - user, shelf_name, goodreads_user_id, max_books, expected_number_of_books) - gc = GoodreadsClient(key=settings.GOODREADS_API_KEY, secret=settings.GOODREADS_API_SECRET, user=user) - - if goodreads_user_id is None: - if user.profile.goodreads_user_id is not None: - goodreads_user_id = user.profile.goodreads_user_id - else: - raise Exception("No Goodreads user_id is associated with user.") - - logger.info('computed goodreads_user_id: %s ', goodreads_user_id) - - for (i, review) in enumerate(islice(gc.review_list(goodreads_user_id,shelf=shelf_name),max_books)): - isbn = review["book"]["isbn10"] if review["book"]["isbn10"] is not None else review["book"]["isbn13"] - logger.info("%d %s %s %s ", i, review["book"]["title"], isbn, review["book"]["small_image_url"]) - try: - edition = bookloader.add_by_isbn(isbn) - if not edition: - continue - # save the goodreads id since we know it at this point - # we need to extract it from the link since review['id'] - # is the id for a users review, not the book - link = review['book']['link'] - match = re.search('/show/(\d+)', link) - if match: - identifier= models.Identifier.get_or_add(type = 'gdrd', value = match.group(1), edition = edition, work = edition.work) - user.wishlist.add_work(edition.work, 'goodreads', notify=True) - logger.info("Work with isbn %s added to wishlist.", isbn) - else: - logger.error("unable to extract goodreads id from %s", link) - if edition.new: - regluit.core.tasks.populate_edition.delay(edition.isbn_13) - - except Exception, e: - logger.info ("Exception adding ISBN %s: %s", isbn, e) - - logger.info('Leaving load_goodreads_shelf_into_wishlist. 
Length of wishlist for user %s is %s', user, len(user.wishlist.works.all())) - - return user.wishlist diff --git a/core/isbn.py b/core/isbn.py index 9661cf6d2..0d1586de2 100644 --- a/core/isbn.py +++ b/core/isbn.py @@ -165,8 +165,8 @@ def to_string(self, type='13', hyphenate=False): return "%s-%s-%s-%s-%s" % (s[0:3], s[3], s[4:7], s[7:12], s[12]) else: return self.__isbn13 - def __unicode__(self): - return unicode(self.to_string(type=self.type, hyphenate=False)) + def __str__(self): + return str(self.to_string(type=self.type, hyphenate=False)) def __str__(self): s = self.to_string(type=self.type, hyphenate=False) if s is not None: diff --git a/core/librarything.py b/core/librarything.py index ba38f1fe3..57141f050 100644 --- a/core/librarything.py +++ b/core/librarything.py @@ -1,6 +1,6 @@ import csv -import HTMLParser -import httplib +from html import parser as HTMLParser +import http.client as httplib import logging import re from datetime import datetime @@ -140,7 +140,7 @@ def viewstyle_5(self, rows): # lc classification try: book_data["lc_call_number"] = cols[2].xpath('.//span')[0].text - except Exception, e: + except Exception as e: logger.info("no lc call number for: %s %s", book_data["title"], e) book_data["lc_call_number"] = None @@ -156,7 +156,7 @@ def viewstyle_5(self, rows): # check for   if book_data["isbn"] == u'\xA0': book_data["isbn"] = None - except Exception, e: + except Exception as e: book_data["isbn"] = None yield book_data @@ -203,7 +203,7 @@ def parse_user_catalog(self, view_style=1): count_text = etree.xpath('//td[@class="pbGroup"]')[0].text total = int(re.search(r'(\d+)$', count_text).group(1)) logger.info('total: %d', total) - except Exception, e: + except Exception as e: # assume for now that if we can't grab this text, # there is no page bar and no books logger.info('Exception {0}'.format(e)) @@ -277,5 +277,5 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None): if edition.new: 
tasks.populate_edition.delay(edition.isbn_13) logger.info("Work with isbn %s added to wishlist.", isbn) - except Exception, e: + except Exception as e: logger.info("error adding ISBN %s: %s", isbn, e) diff --git a/core/loaders/__init__.py b/core/loaders/__init__.py index e47ca29ba..0adc6762e 100755 --- a/core/loaders/__init__.py +++ b/core/loaders/__init__.py @@ -1,3 +1,5 @@ +import logging +from ssl import SSLError import requests from bs4 import BeautifulSoup @@ -9,9 +11,12 @@ from .scrape import BaseScraper from .hathitrust import HathitrustScraper from .pressbooks import PressbooksScraper +from .routledge import RoutledgeScraper from .springer import SpringerScraper -from .ubiquity import UbiquityScraper from .smashwords import SmashwordsScraper +from .ubiquity import UbiquityScraper + +logger = logging.getLogger(__name__) def get_scraper(url): scrapers = [ @@ -20,6 +25,7 @@ def get_scraper(url): UbiquityScraper, SmashwordsScraper, HathitrustScraper, + RoutledgeScraper, BaseScraper, ] for scraper in scrapers: @@ -29,32 +35,43 @@ def get_scraper(url): def scrape_sitemap(url, maxnum=None): try: response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) - doc = BeautifulSoup(response.content, 'lxml') + doc = BeautifulSoup(response.content, 'xml') for page in doc.find_all('loc')[0:maxnum]: scraper = get_scraper(page.text) if scraper.metadata.get('genre', None) == 'book': yield scraper except requests.exceptions.RequestException as e: logger.error(e) + except SSLError as e: + logger.error(e) -def add_by_webpage(url, work=None, user=None): - edition = None - scraper = get_scraper(url) - loader = BasePandataLoader(url) +def add_by_metadata(metadata, url='', work=None, user=None): pandata = Pandata() - pandata.metadata = scraper.metadata + loader = BasePandataLoader(url) + pandata.metadata = metadata for metadata in pandata.get_edition_list(): edition = loader.load_from_pandata(metadata, work) - work = edition.work + if hasattr(edition, 'work'): + work = 
edition.work + else: + return None loader.load_ebooks(pandata, edition, user=user) return edition if edition else None - +def add_by_webpage(url, work=None, user=None): + if not url: + return None + edition = None + scraper = get_scraper(url) + return add_by_metadata(scraper.metadata, url=url, work=None, user=None) + + def add_by_sitemap(url, maxnum=None): return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum)) def scrape_language(url): scraper = get_scraper(url) - return scraper.metadata.get('language') + language = scraper.metadata.get('language') + return language if language else 'xx' diff --git a/core/loaders/doab.py b/core/loaders/doab.py index 6d3643286..d9f95b5aa 100644 --- a/core/loaders/doab.py +++ b/core/loaders/doab.py @@ -1,30 +1,34 @@ #!/usr/bin/env python # encoding: utf-8 import datetime -import json import logging import re +import urllib.error import requests +from io import BytesIO +from PIL import Image, UnidentifiedImageError + +from django.conf import settings from django.db.models import Q from django.core.files.base import ContentFile from django.core.files.storage import default_storage from oaipmh.client import Client -from oaipmh.error import IdDoesNotExistError -from oaipmh.metadata import MetadataRegistry, oai_dc_reader +from oaipmh.error import IdDoesNotExistError, NoRecordsMatchError +from oaipmh.metadata import MetadataRegistry from regluit.core import bookloader, cc from regluit.core import models, tasks from regluit.core.bookloader import merge_works -from regluit.core.isbn import ISBN -from regluit.core.loaders.utils import type_for_url -from regluit.core.validation import identifier_cleaner, valid_subject +from regluit.core.models.loader import type_for_url +from regluit.core.validation import identifier_cleaner, valid_subject, explode_bics from . 
import scrape_language -from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider +from .doab_utils import ( + doab_lang_to_iso_639_1, doab_cover, doab_reader, online_to_download, STOREPROVIDERS) logger = logging.getLogger(__name__) @@ -33,7 +37,6 @@ def unlist(alist): return None return alist[0] - SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U) SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg' def store_doab_cover(doab_id, redo=False): @@ -41,8 +44,10 @@ def store_doab_cover(doab_id, redo=False): """ returns tuple: 1) cover URL, 2) whether newly created (boolean) """ + if not doab_id: + return (None, False) - cover_file_name = '/doab/%s/cover' % (doab_id) + cover_file_name = '/doab/%s' % doab_id # if we don't want to redo and the cover exists, return the URL of the cover @@ -50,28 +55,47 @@ def store_doab_cover(doab_id, redo=False): return (default_storage.url(cover_file_name), False) # download cover image to cover_file - url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id) + url = doab_cover(doab_id) + headers = {"User-Agent": settings.USER_AGENT} + if not url: + return (None, False) try: - r = requests.get(url, allow_redirects=False) # requests doesn't handle ftp redirects. + r = requests.get(url, allow_redirects=False, headers=headers, timeout=(5, 60)) # requests doesn't handle ftp redirects. 
if r.status_code == 302: redirurl = r.headers['Location'] if redirurl.startswith(u'ftp'): springerftp = SPRINGER_COVER.match(redirurl) if springerftp: - redirurl = SPRINGER_IMAGE.format(springerftp.groups(1)) - r = requests.get(redirurl) + redirurl = SPRINGER_IMAGE.format(springerftp.group(1)) + r = requests.get(redirurl, headers=headers, timeout=(5, 60)) else: - r = requests.get(url) - else: - r = requests.get(url) + r = requests.get(redirurl, headers=headers, timeout=(5, 60)) + if not r.content: + logger.warning('No image content for doab_id=%s', doab_id) + return (None, False) + + #test that cover is good + image_bytes = BytesIO(r.content) + try: + image = Image.open(image_bytes) + except UnidentifiedImageError: + logger.warning(f'No image found for {doab_id}') + return (None, False) + cover_file = ContentFile(r.content) - cover_file.content_type = r.headers.get('content-type', '') + content_type = r.headers.get('content-type', '') + if not 'image/' in content_type: + logger.warning('Non-image returned for doab_id=%s', doab_id) + return (None, False) + cover_file.content_type = content_type + + default_storage.save(cover_file_name, cover_file) return (default_storage.url(cover_file_name), True) - except Exception, e: + except Exception as e: # if there is a problem, return None for cover URL - logger.warning('Failed to make cover image for doab_id={}: {}'.format(doab_id, e)) + logger.warning('Failed to make cover image for doab_id=%s: %s', doab_id, e) return (None, False) def update_cover_doab(doab_id, edition, store_cover=True, redo=True): @@ -82,16 +106,23 @@ def update_cover_doab(doab_id, edition, store_cover=True, redo=True): if store_cover: (cover_url, new_cover) = store_doab_cover(doab_id, redo=redo) else: - cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id) + cover_url = doab_cover(doab_id) if cover_url is not None: edition.cover_image = cover_url edition.save() + good = edition.cover_image_small() and 
edition.cover_image_thumbnail() + if not good: + # oh well + logger.warning("Couldn't make thumbnails for %s using %s", doab_id, cover_url) + edition.cover_image = None + edition.save() return cover_url return None def attach_more_doab_metadata(edition, description, subjects, - publication_date, publisher_name=None, language=None, authors=u''): + publication_date, publisher_name=None, language=None, + dois=None, authors=None, editors=None): """ for given edition, attach description, subjects, publication date to @@ -109,10 +140,11 @@ def attach_more_doab_metadata(edition, description, subjects, # attach description to work if it's not empty work = edition.work - if not work.description: - work.description = description + if description and not work.description: + work.description = description.replace('\r\n', '\n') # update subjects + subjects = explode_bics(subjects) for s in subjects: if valid_subject(s): models.Subject.set_by_name(s, work=work) @@ -125,14 +157,18 @@ def attach_more_doab_metadata(edition, description, subjects, work.language = language work.save() - if authors and authors == authors: # test for authors != NaN - authlist = creator_list(authors) + if authors or editors: + authlist = creator_list(authors, editors) if edition.authors.all().count() < len(authlist): edition.authors.clear() if authlist is not None: for [rel, auth] in authlist: edition.add_author(auth, rel) + for doi in dois if dois else []: + if not edition.work.doi: + models.Identifier.set('doi', doi, work=edition.work) + break return edition def add_all_isbns(isbns, work, language=None, title=None): @@ -152,17 +188,16 @@ def add_all_isbns(isbns, work, language=None, title=None): return work, first_edition def load_doab_edition(title, doab_id, url, format, rights, - language, isbns, - provider, **kwargs): - + language, isbns, provider, dois=None, **kwargs): """ load a record from doabooks.org represented by input parameters and return an ebook """ - logger.info('load doab {} {} {} {} 
{}'.format(doab_id, format, rights, language, provider)) + logger.info('load doab %s %s %s %s %s', doab_id, format, rights, language, provider) + url = url.strip() if language and isinstance(language, list): language = language[0] if language == 'xx' and format == 'online': - language = scrape_language(url) + language = doab_lang_to_iso_639_1(scrape_language(url)) # check to see whether the Edition hasn't already been loaded first # search by url ebooks = models.Ebook.objects.filter(url=url) @@ -181,31 +216,34 @@ def load_doab_edition(title, doab_id, url, format, rights, ebook = None if len(ebooks) > 1: raise Exception("There is more than one Ebook matching url {0}".format(url)) - elif len(ebooks) == 1: + if len(ebooks) == 1: ebook = ebooks[0] - doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id, - work=ebook.edition.work) - if not ebook.rights: - ebook.rights = rights - ebook.save() - - # update the cover id - cover_url = update_cover_doab(doab_id, ebook.edition, redo=False) - - # attach more metadata - attach_more_doab_metadata( - ebook.edition, - description=unlist(kwargs.get('description')), - subjects=kwargs.get('subject'), - publication_date=unlist(kwargs.get('date')), - publisher_name=unlist(kwargs.get('publisher')), - language=language, - authors=kwargs.get('creator'), - ) - # make sure all isbns are added - add_all_isbns(isbns, ebook.edition.work, language=language, title=title) - return ebook.edition - + if not ebook.edition.work.doab or ebook.edition.work.doab == doab_id: + models.Identifier.get_or_add(type='doab', value=doab_id, work=ebook.edition.work) + + if not ebook.rights: + ebook.rights = rights + ebook.save() + + # update the cover id + update_cover_doab(doab_id, ebook.edition, redo=False) + + # attach more metadata + attach_more_doab_metadata( + ebook.edition, + description=unlist(kwargs.get('description')), + subjects=kwargs.get('subject'), + publication_date=unlist(kwargs.get('date')), + 
publisher_name=unlist(kwargs.get('publisher')), + language=language, + authors=kwargs.get('creator'), + dois=dois, + ) + # make sure all isbns are added + add_all_isbns(isbns, ebook.edition.work, language=language, title=title) + return ebook.edition + # don't add a second doab to an existing Work + return None # remaining case --> no ebook, load record, create ebook if there is one. assert not ebooks @@ -254,7 +292,7 @@ def load_doab_edition(title, doab_id, url, format, rights, if editions_with_ebooks: edition = editions_with_ebooks[0] elif work.editions.all(): - edition = work.editions.all()[0] + edition = work.editions.first() else: edition = models.Edition(work=work, title=title) edition.save() @@ -263,9 +301,12 @@ def load_doab_edition(title, doab_id, url, format, rights, work.selected_edition = edition work.save() - if format in ('pdf', 'epub', 'mobi', 'html', 'online') and rights: + if format in ('pdf', 'epub', 'html', 'online') and rights: ebook = models.Ebook() - ebook.format = format + if format == 'online' and provider in STOREPROVIDERS: + ebook.format = 'bookstore' + else: + ebook.format = format ebook.provider = provider ebook.url = url ebook.rights = rights @@ -286,7 +327,15 @@ def load_doab_edition(title, doab_id, url, format, rights, publication_date=unlist(kwargs.get('date')), publisher_name=unlist(kwargs.get('publisher')), authors=kwargs.get('creator'), + editors=kwargs.get('editor'), + dois=dois, ) + if rights: + for ebook in edition.ebooks.all(): + if not ebook.rights: + ebook.rights = rights + ebook.save() + return edition # @@ -294,7 +343,8 @@ def load_doab_edition(title, doab_id, url, format, rights, # au = re.compile(r'\(Authors?\)', flags=re.U) -ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U) +ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', + flags=re.U) tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U) ai = 
re.compile(r'\([^\)]*(Introduction|Foreword)[^\)]*\)', flags=re.U) ds = re.compile(r'\([^\)]*(designer)[^\)]*\)', flags=re.U) @@ -311,14 +361,11 @@ def fnf(auth): parts = re.sub(r' +', u' ', auth).split(u',') if len(parts) == 1: return parts[0].strip() - elif len(parts) == 2: + if len(parts) == 2: return u'{} {}'.format(parts[1].strip(), parts[0].strip()) - else: - if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'): - return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip()) - #print auth - #print re.search(namelist,auth).group(0) - return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip()) + if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'): + return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip()) + return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip()) def creator(auth, editor=False): @@ -339,51 +386,70 @@ def creator(auth, editor=False): auth = au.sub('', auth) return ['aut', fnf(auth)] -def creator_list(creators): +def creator_list(creators, editors): auths = [] - for auth in creators: - auths.append(creator(auth)) + if creators: + for auth in creators: + auths.append(creator(auth)) + if editors: + for auth in editors: + auths.append(creator(auth, editor=True)) return auths -DOAB_OAIURL = 'https://www.doabooks.org/oai' -DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*') +DOAB_OAIURL = 'https://directory.doabooks.org/oai/request' +DOAB_PATT = re.compile(r'oai:directory\.doabooks\.org:(.*)') mdregistry = MetadataRegistry() -mdregistry.registerReader('oai_dc', oai_dc_reader) +mdregistry.registerReader('oai_dc', doab_reader) doab_client = Client(DOAB_OAIURL, mdregistry) isbn_cleaner = identifier_cleaner('isbn', quiet=True) -ISBNSEP = re.compile(r'[/]+') +doi_cleaner = identifier_cleaner('doi', quiet=True) +ISBNSEP = re.compile(r'[/;]+') def add_by_doab(doab_id, record=None): try: record = record if record else doab_client.getRecord( 
metadataPrefix='oai_dc', - identifier='oai:doab-books:{}'.format(doab_id) + identifier='oai:directory.doabooks.org:{}'.format(doab_id) ) + if not record[1]: + logger.error('No content in record %s', record) + return None metadata = record[1].getMap() isbns = [] - url = None + dois = [] + urls = [] + for ident in metadata.pop('isbn', []): + isbn_strings = ISBNSEP.split(ident[6:].strip()) + for isbn_string in isbn_strings: + isbn = isbn_cleaner(isbn_string) + if isbn: + isbns.append(isbn) + for ident in metadata.pop('doi', []): + ident = doi_cleaner(ident) + if ident: + dois.append(ident) for ident in metadata.pop('identifier', []): - if ident.startswith('ISBN: '): - isbn_strings = ISBNSEP.split(ident[6:].strip()) - for isbn_string in isbn_strings: - isbn = isbn_cleaner(isbn_string) - if isbn: - isbns.append(isbn) - elif ident.find('doabooks.org') >= 0: + if ident.find('doabooks.org') >= 0: # should already know the doab_id continue - else: - url = ident + if ident.startswith('http'): + urls.append(ident) language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None))) - urls = online_to_download(url) + xurls = [] + for url in urls: + xurls += online_to_download(url) + urls = xurls edition = None title = unlist(metadata.pop('title', None)) license = cc.license_from_cc_url(unlist(metadata.pop('rights', None))) + if title == None: + logger.error(f'doab record {doab_id} has no title') + return None for dl_url in urls: format = type_for_url(dl_url) if 'format' in metadata: del metadata['format'] - edition = load_doab_edition( + added_edition = load_doab_edition( title, doab_id, dl_url, @@ -391,25 +457,14 @@ def add_by_doab(doab_id, record=None): license, language, isbns, - url_to_provider(dl_url) if dl_url else None, - **metadata - ) - else: - if 'format' in metadata: - del metadata['format'] - edition = load_doab_edition( - title, - doab_id, - '', - '', - license, - language, - isbns, - None, + models.Ebook.infer_provider(dl_url) if dl_url else None, + 
dois=dois, **metadata ) + edition = added_edition if added_edition else edition return edition - except IdDoesNotExistError: + except IdDoesNotExistError as e: + logger.error(e) return None @@ -419,30 +474,66 @@ def getdoab(url): return id_match.group(1) return False -def load_doab_oai(from_year=None, limit=100000): + +def get_doab_record(doab_id): + record_id = 'oai:directory.doabooks.org:%s' % doab_id + try: + return doab_client.getRecord(metadataPrefix='oai_dc', identifier=record_id) + except IdDoesNotExistError: + return None + +def load_doab_oai(from_date, until_date, limit=100): ''' use oai feed to get oai updates ''' - if from_year: - from_ = datetime.datetime(year=from_year, month=1, day=1) - else: + start = datetime.datetime.now() + if from_date: + from_ = from_date + else: # last 15 days from_ = datetime.datetime.now() - datetime.timedelta(days=15) - doab_ids = [] - for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_): - if not record[1]: - continue - item_type = unlist(record[1].getMap().get('type', None)) - if item_type != 'book': - continue - idents = record[1].getMap()['identifier'] - if idents: - for ident in idents: - doab = getdoab(ident) - if doab: - doab_ids.append(doab) + num_doabs = 0 + new_doabs = 0 + lasttime = datetime.datetime(2000, 1, 1) + try: + for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_, + until=until_date): + if not record[1]: + continue + item_types = [str(t).strip().lower() for t in (record[1].getMap().get('type') or [])] + if 'book' not in item_types: + continue + ident = record[0].identifier() + datestamp = record[0].datestamp() + lasttime = datestamp if datestamp > lasttime else lasttime + doab = getdoab(ident) + if doab: + num_doabs += 1 + try: e = add_by_doab(doab, record=record) - title = e.title if e else None - logger.info(u'updated:\t{}\t{}'.format(doab, title)) - if len(doab_ids) > limit: - break + except Exception as ex: + logger.exception('unexpected error processing 
doab #%s: %s', doab, ex) + continue + if not e: + logger.error('null edition for doab #%s', doab) + continue + if e.created > start: + new_doabs += 1 + title = e.title if e else None + logger.info(u'updated:\t%s\t%s', doab, title) + if limit is not None and num_doabs >= limit: + break + except NoRecordsMatchError: + pass + except urllib.error.HTTPError as e: + if e.code == 429: + retry_after = e.headers.get('Retry-After', 'unknown') + logger.error( + 'DOAB OAI rate-limited (HTTP 429). ' + 'Retry-After: %s seconds. Harvest stopped after %s records.', + retry_after, num_doabs + ) + else: + raise + return num_doabs, new_doabs, lasttime + \ No newline at end of file diff --git a/core/loaders/doab_utils.py b/core/loaders/doab_utils.py index ceef8bb70..11590eb7c 100644 --- a/core/loaders/doab_utils.py +++ b/core/loaders/doab_utils.py @@ -2,129 +2,159 @@ doab_utils.py """ - +import logging import re -import urlparse +from ssl import SSLError +from urllib.parse import urljoin import requests -from regluit.utils.lang import get_language_code -from .utils import get_soup - -# utility functions for converting lists of individual items into individual items - -# let's do a mapping of the DOAB languages into the language codes used -# mostly, we just handle mispellings -# also null -> xx - -EXTRA_LANG_MAP = dict([ - (u'chinese', 'de'), - (u'deutsch', 'de'), - (u'eng', 'en'), - (u'englilsh', 'en'), - (u'englilsh', 'en'), - (u'englisch', 'en'), - (u'espanol', 'es'), - (u'ger', 'de'), - (u'fra', 'fr'), - (u'fre', 'fr'), - (u'francese', 'fr'), - (u'ita', 'it'), - (u'italiano', 'it'), - (u'norwegian', 'no'), - (u'por', 'pt'), - (u'portugese', 'pt'), - (u'slovene', 'sl'), - (u'spa', 'es'), - (u'spagnolo', 'es'), -]) - -sep = re.compile(r'[ \-;^,/]+') +from oaipmh.metadata import MetadataReader + +from django.conf import settings + +from regluit.core import models +from regluit.utils.lang import lang_to_language_code +from .soup import get_soup + + +logger = 
logging.getLogger(__name__) + def doab_lang_to_iso_639_1(lang): - if lang is None or not lang: - return "xx" - else: - lang = sep.split(lang)[0] - code = get_language_code(lang) - if code: - return code - else: - return EXTRA_LANG_MAP.get(lang.lower(), 'xx') - - -DOMAIN_TO_PROVIDER = dict([ - [u'antropologie.zcu.cz', u'AntropoWeb'], - [u'books.mdpi.com', u'MDPI Books'], - [u'books.openedition.org', u'OpenEdition Books'], - [u'books.scielo.org', u'SciELO'], - [u'ccdigitalpress.org', u'Computers and Composition Digital Press'], - [u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'], - [u'dl.dropboxusercontent.com', u'Dropbox'], - [u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'], - [u'dx.doi.org', u'DOI Resolver'], - [u'ebooks.iospress.nl', u'IOS Press Ebooks'], - [u'hdl.handle.net', u'Handle Proxy'], - [u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'], - [u'img.mdpi.org', u'MDPI Books'], - [u'ledibooks.com', u'LediBooks'], - [u'leo.cilea.it', u'LEO '], - [u'leo.cineca.it', u'Letteratura Elettronica Online'], - [u'link.springer.com', u'Springer'], - [u'oapen.org', u'OAPEN Library'], - [u'press.openedition.org', u'OpenEdition Press'], - [u'windsor.scholarsportal.info', u'Scholars Portal'], - [u'www.adelaide.edu.au', u'University of Adelaide'], - [u'www.aliprandi.org', u'Simone Aliprandi'], - [u'www.antilia.to.it', u'antilia.to.it'], - [u'www.aupress.ca', u'Athabasca University Press'], - [u'www.bloomsburyacademic.com', u'Bloomsbury Academic'], - [u'www.co-action.net', u'Co-Action Publishing'], - [u'www.degruyter.com', u'De Gruyter Online'], - [u'www.doabooks.org', u'Directory of Open Access Books'], - [u'www.dropbox.com', u'Dropbox'], - [u'www.ebooks.iospress.nl', u'IOS Press Ebooks'], - [u'www.ledizioni.it', u'Ledizioni'], - [u'www.maestrantonella.it', u'maestrantonella.it'], - [u'www.oapen.org', u'OAPEN Library'], - [u'www.openbookpublishers.com', u'Open Book Publishers'], - [u'www.palgraveconnect.com', u'Palgrave 
Connect'], - [u'www.scribd.com', u'Scribd'], - [u'www.springerlink.com', u'Springer'], - [u'www.ubiquitypress.com', u'Ubiquity Press'], - [u'www.unimib.it', u'University of Milano-Bicocca'], - [u'www.unito.it', u"University of Turin"], -]) - -def url_to_provider(url): - netloc = urlparse.urlparse(url).netloc - return DOMAIN_TO_PROVIDER.get(netloc, netloc) - -FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)') + lang = lang_to_language_code(lang) + return lang if lang else 'xx' + + +doab_reader = MetadataReader( + fields={ + 'title': ('textList', 'oai_dc:dc/dc:title/text()'), + 'creator': ('textList', 'oai_dc:dc/dc:creator/text()'), + 'subject': ('textList', 'oai_dc:dc/dc:subject/text()'), + 'description': ('textList', 'oai_dc:dc/dc:description/text()'), + 'publisher': ('textList', 'oai_dc:dc/dc:publisher/text()'), + 'editor': ('textList', 'oai_dc:dc/dc:contributor[@type="Editor"]/text()'), + 'date': ('textList', 'oai_dc:dc/dc:date[@type="Issued"]/text()'), + 'type': ('textList', 'oai_dc:dc/oaire:resourceType/text()'), + 'format': ('textList', 'oai_dc:dc/dc:format/text()'), + 'identifier': ('textList', 'oai_dc:dc/dc:identifier/text()'), + 'source': ('textList', 'oai_dc:dc/dc:source/text()'), + 'language': ('textList', 'oai_dc:dc/dc:language/text()'), + 'relation': ('textList', 'oai_dc:dc/dc:relation/text()'), + 'coverage': ('textList', 'oai_dc:dc/dc:coverage/text()'), + 'rights': ('textList', 'oai_dc:dc/oaire:licenseCondition/@uri'), + 'isbn': ('textList', 'oai_dc:dc/dc:alternateIdentifier[@type="ISBN"]/text()'), + 'doi': ('textList', 'oai_dc:dc/dc:alternateIdentifier[@type="DOI"]/text()'), + }, + namespaces={ + 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', + 'dc' : 'http://purl.org/dc/elements/1.1/', + 'grantor': 'http://purl.org/dc/elements/1.1/', + 'publisher': 'http://purl.org/dc/elements/1.1/', + 'oapen': 'http://purl.org/dc/elements/1.1/', + 'oaire': 'https://raw.githubusercontent.com/rcic/openaire4/master/schemas/4.0/oaire.xsd', + 
'datacite': 'https://schema.datacite.org/meta/kernel-4.1/metadata.xsd', + 'doc': 'http://www.lyncode.com/xoai' + } +) +STOREPROVIDERS = [ + '7switch.com', + 'amazon.ca', + 'amazon.co.uk', + 'amazon.com', + 'amazon.de', + 'amzn.to', + 'apress.com', + 'bloomsbury.com', + 'bod.de', + 'booksdirect.co.za', + 'cabi.org', + 'cdcshoppingcart.uchicago.edu', + 'checkout.sas.ac.uk', + 'duncker-humblot.de', + 'dykinson.com', + 'e-elgar.com', + 'edicions.ub.edu', + 'epubli.de', + 'eurekaselect.com', + 'fondazionecafoscari.storeden.com', + 'global.oup.com', + 'iospress.nl', + 'karolinum.cz', + 'librumstore.com', + 'logos-verlag.de', + 'manchesteruniversitypress.co.uk', + 'mitpress.mit.edu', + 'munishop.muni.cz', + 'nai010.com', + 'nomos-shop.de', + 'palgrave.com', + 'placedeslibraires.fr', + 'play.google.com', + 'press.umich.edu', + 'pressesuniversitairesdeliege.be', + 'publicacions.ub.edu', + 'publicacions.urv.cat', + 'schueren-verlag.de', + 'sci.fo', + 'store.printservice.nl', + 'una-editions.fr', + 'universitaetsverlag.uni-kiel.de', + 'universitetsforlaget.no', + 'urldefense.com', + 'usu.edu', + 'uwapress.uw.edu', + 'wbg-wissenverbindet.de', + 'zalozba.zrc-sazu.si', +] def online_to_download(url): urls = [] if not url: return urls - if url.find(u'mdpi.com/books/pdfview/book/') >= 0: - doc = get_soup(url) - if doc: - obj = doc.find('object', type='application/pdf') - if obj: - urls.append(obj['data'].split('#')[0]) - elif url.find(u'books.scielo.org/') >= 0: - doc = get_soup(url) - if doc: - obj = doc.find('a', class_='pdf_file') - if obj: - urls.append(urlparse.urljoin(url, obj['href'])) - obj = doc.find('a', class_='epub_file') - if obj: - urls.append(urlparse.urljoin(url, obj['href'])) - elif FRONTIERSIN.search(url): - booknum = FRONTIERSIN.search(url).group(1) - urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum)) - urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum)) + + elif 
url.find(u'edp-open.org/books-in-') >= 0: + # pages needing multi-scrape + return urls else: urls.append(url) + if not urls: + logging.warning('no valid download urls for %s', url) return urls + +STREAM_QUERY = 'https://directory.doabooks.org/rest/search?query=handle:{}&expand=bitstreams' + +def get_streamdata(handle): + url = STREAM_QUERY.format(handle) + try: + response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}, timeout=(5, 60)) + if response.status_code == 429: + retry_after = response.headers.get('Retry-After', 'unknown') + logger.error('DOAB bitstream API rate-limited (HTTP 429) for %s. Retry-After: %s', handle, retry_after) + return None + items = response.json() + if items: + for stream in items[0]['bitstreams']: + if stream['bundleName'] == "THUMBNAIL": + stream['handle'] = handle + return stream + else: + logger.error("No items in streamdata for %s", handle) + except requests.exceptions.RequestException as e: + logger.error(e) + except SSLError as e: + logger.error(e) + except ValueError as e: + # decoder error + logger.error(e) + +COVER_FSTRING = "https://directory.doabooks.org/bitstream/handle/{handle}/{name}?sequence={sequenceId}&isAllowed=y" +def doab_cover(doab_id): + stream_data = get_streamdata(doab_id) + if not stream_data: + logger.error('get_streamdata failed for %s', doab_id) + return None + if 'retrieveLink' in stream_data: + return f"https://directory.doabooks.org{stream_data['retrieveLink']}" + return COVER_FSTRING.format(**stream_data) + diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py new file mode 100644 index 000000000..3d9c1898d --- /dev/null +++ b/core/loaders/harvest.py @@ -0,0 +1,1553 @@ +""" +code for harvesting 'online' ebooks +""" +import json +import logging +import re +import time +from urllib.parse import quote, unquote, urljoin, urlparse, urlsplit, urlunsplit + +import requests + +from django.conf import settings +from django.core.files.base import ContentFile +from django.core.files.storage 
import default_storage + +from regluit.core import models +from regluit.core.models import loader +from regluit.core.parameters import GOOD_PROVIDERS, DOWNLOADABLE +from regluit.core.pdf import staple_pdf + +from .soup import get_soup +from .doab_utils import STOREPROVIDERS + +logger = logging.getLogger(__name__) + +DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"') +DELAY = 1.0 + +class RateLimiter(object): + def __init__(self): + self.last = {} + + def delay(self, provider): + if provider in self.last: + prev = self.last[provider] + pres = time.time() + if pres - prev < DELAY: + time.sleep(float(DELAY - pres + prev)) + self.last[provider] = time.time() + return + +rl = RateLimiter() + + +def set_bookshop(ebook): + ebook.format = 'bookshop' + ebook.save() + return None, 0 + +def set_broken(ebook): + ebook.format = 'broken' + ebook.save() + return None, 0 + + +def dl_online(ebook, limiter=rl.delay, format='online', force=False): + if ebook.format != format or (not force and ebook.provider in DONT_HARVEST): + return None, 0 + if ebook.provider in STOREPROVIDERS: + return set_bookshop(ebook) + if ebook.ebook_files.exists(): + return ebook.ebook_files.first(), 0 + for do_harvest, harvester in harvesters(ebook): + if do_harvest: + for ebf in ebf_if_harvested(ebook.url): + clean_archive(ebf) + return ebf, 0 + limiter(ebook.provider) + return harvester(ebook) + return None, 0 + + +def archive_dl(ebook, limiter=rl.delay, force=False): + """ status codes + 0 : archive exists + 1 : archive made + -1 : urls does not return an ebook file + """ + status = -1 + ebf = None + if ebook.ebook_files.filter(asking=False).exists(): + status = 0 + elif models.EbookFile.objects.filter(source=ebook.url, format=ebook.format).exists(): + status = 0 + else: + dl_cf, fmt = loader.load_ebookfile(ebook.url, ebook.format) + if dl_cf: + ebf, num = make_harvested_ebook(dl_cf, ebook, fmt, filesize=dl_cf.size) + clean_archive(ebf) + status = 1 + else: + 
logger.warning('download format %s for %s is not ebook', ebook.format, ebook.url) + if fmt == 404: + set_broken(ebook) + limiter(ebook.provider) + if not ebf: + status = -1 + return status + + +def clean_archive(ebf): + fsize = ebf.ebook.filesize + ebook = ebf.ebook + if not fsize or ebf.asking == 1 or ebook.format not in DOWNLOADABLE or not ebook.active: + return + # find duplicate files by looking at filesize + old_ebooks = models.Ebook.objects.filter(filesize=fsize, provider=ebf.ebook.provider, + edition__work=ebf.edition.work, format=ebf.format + ).exclude(id=ebf.ebook.id) + for old_ebook in old_ebooks: + old_ebook.active = False + for oldebf in old_ebook.ebook_files.exclude(id=ebf.id): + if oldebf.file != ebf.file: + # save storage by deleting redundant files + oldebf.file.delete() + oldebf.file = ebf.file + oldebf.save() + old_ebook.save() + + +CMPPROVIDERS = [ + 'books.atla.com', + 'books.open.tudelft.nl', + 'ebooks.epublishing.ekt.gr', + 'ebooks.marilia.unesp.br', + 'ebooks.uminho.pt', + 'editorial.inudi.edu.pe', + 'editorial.ucatolicaluisamigo.edu.co', + 'editorial.uniagustiniana.edu.co', + 'editorialgrupo-aea.com', + 'fcjp.derecho.unap.edu.pe', + 'fedoabooks.unina.it', + 'humanities-digital-library.org', + 'idicap.com', + 'libri.unimi.it', + 'libros.fahce.unlp.edu.ar', + 'libros.unad.edu.co', + 'libros.usc.edu.co', + 'llibres.urv.cat', + 'monografias.editorial.upv.es', + 'monograph.com.ua', + 'monographs.uc.pt', + 'omp.ub.rub.de', + 'openuctpress.uct.ac.za', + 'omp.zrc-sazu.si', + 'openbooks.uct.ac.za', + 'openpress.mtsu.edu', + 'omp.ub.rub.de', + 'ozguryayinlari.com', + 'penerbit.brin.go.id', + 'press.uni.lodz.pl', + 'redliclibros.com', + 'Scholars Portal', + 'teiresias-supplements.mcgill.ca', + 'textbooks.open.tudelft.nl', + 'unicapress.unica.it', +] + +DSPACEPROVIDERS = [ + 'acikerisim.kapadokya.edu.tr', + 'diposit.ub.edu', + 'orbi.ulg.ac.be', + 'orbi.uliege.be', + 'publikationen.uni-tuebingen.de', + '', +] + +DONT_HARVEST = [ + 'Unglue.it', + 
'Github', + 'Project Gutenberg', + 'Google Books', + 'OpenEdition Books', +] +MANUAL_HARVEST = [ + 'cabidigitallibrary.org', + 'books.google.be', + 'books.google.ch', + 'books.google.nl', +] + + +def harvesters(ebook): + yield ebook.provider == 'OAPEN Library', harvest_oapen + yield ebook.provider == 'SciELO', harvest_scielo + yield ebook.provider in GOOD_PROVIDERS, harvest_generic + yield ebook.provider in MANUAL_HARVEST, harvest_manual + yield 'dropbox.com/s/' in ebook.url, harvest_dropbox + yield ebook.provider == 'jbe-platform.com', harvest_jbe + yield ebook.provider == u'De Gruyter Online', harvest_degruyter + yield ebook.provider == 'Open Book Publishers', harvest_obp + yield ebook.provider == 'Transcript-Verlag', harvest_transcript + yield ebook.provider == 'shop.budrich.de', harvest_budrich + yield ebook.provider == 'ksp.kit.edu', harvest_ksp + yield ebook.provider in ['repositorio.americana.edu.co'], harvest_dspace2 + yield ebook.provider == 'nomos-elibrary.de', harvest_nomos + yield ebook.provider == 'digitalis.uc.pt', harvest_digitalis + yield 'frontiersin.org' in ebook.provider, harvest_frontiersin + yield ebook.provider in ['Palgrave Connect', 'Springer', 'springer.com'], harvest_springerlink + yield ebook.provider == 'pulp.up.ac.za', harvest_pulp + yield ebook.provider == 'bloomsburycollections.com', harvest_bloomsbury + yield ebook.provider == 'Athabasca University Press', harvest_athabasca + yield 'digitalcommons.usu.edu' in ebook.url, harvest_usu + yield ebook.provider in ['digital.library.unt.edu', 'texashistory.unt.edu'], harvest_unt + yield ebook.provider in DSPACEPROVIDERS, harvest_dspace + yield ebook.provider == 'e-publish.uliege.be', harvest_liege + yield ebook.provider in CMPPROVIDERS, harvest_cmp + yield 'mdpi' in ebook.provider.lower(), harvest_mdpi + yield ebook.provider == 'idunn.no', harvest_idunn + yield ebook.provider == 'press.ucalgary.ca', harvest_calgary + yield ebook.provider in ['Ledizioni', 'bibsciences.org', + 
'heiup.uni-heidelberg.de', 'e-archivo.uc3m.es'], harvest_generic + yield ebook.provider in ['funlam.edu.co'], harvest_generic_chrome + yield ebook.provider == 'muse.jhu.edu', harvest_muse + yield ebook.provider == 'direct.mit.edu', harvest_mitpress + yield ebook.provider == 'IOS Press Ebooks', harvest_ios + yield ebook.provider == 'elgaronline.com', harvest_elgar + yield ebook.provider == 'worldscientific.com', harvest_wsp + yield ebook.provider in ['edition-open-access.de', 'edition-open-sources.org'], harvest_mprl + yield ebook.provider == 'rti.org', harvest_rti + yield ebook.provider == 'edoc.unibas.ch', harvest_unibas + yield 'pensoft' in ebook.provider, harvest_pensoft + yield ebook.provider == 'edp-open.org', harvest_edp + yield ebook.provider == 'laboutique.edpsciences.fr', harvest_edpsciences + yield ebook.provider == 'waxmann.com', harvest_waxmann + yield ebook.provider == 'pbsociety.org.pl', harvest_ojs + yield 'sciendo.com' in ebook.provider, harvest_sciendo + yield ebook.provider == 'edition-topoi.org', harvest_topoi + yield ebook.provider == 'meson.press', harvest_meson + yield 'brill' in ebook.provider, harvest_brill + yield ebook.provider == 'DOI Resolver', harvest_doi + yield ebook.provider in ['apps.crossref.org', 'mr.crossref.org'], harvest_doi_coaccess + yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab + yield ebook.provider == 'libros.uchile.cl', harvest_libroschile + yield ebook.provider == 'smithsonian.figshare.com', harvest_figshare + yield ebook.provider == 'fupress.com', harvest_fupress + yield ebook.provider == 'funlam.edu.co', harvest_funlam + yield ebook.provider == 'elibrary.duncker-humblot.com', harvest_dunckerhumblot + yield ebook.provider == 'cornellopen.org', harvest_cornellopen + yield ebook.provider == 'esv.info', harvest_esv + yield ebook.provider == 'fulcrum.org', harvest_fulcrum + yield ebook.provider in ('epress.lib.uts.edu.au', 'utsepress.lib.uts.edu.au'), harvest_ubiquity + yield ebook.provider == 'orkana.no', 
harvest_orkana + yield ebook.provider == 'euna.una.ac.cr', harvest_euna + yield ebook.provider == 'openresearchlibrary.org', harvest_orl + yield ebook.provider == 'pressesagro.be', harvest_pressesagro + yield ebook.provider == 'buponline.com', harvest_buponline + yield ebook.provider == 'intechopen.com', harvest_intech + yield ebook.provider == 'usmcu.edu', harvest_usmcu + yield ebook.provider == 'lalibreria.upv.es', harvest_upv + yield ebook.provider == 'cambridge.org', harvest_cambridge + yield ebook.provider == 'exonpublications.com', harvest_exon + yield ebook.provider == 'ressources.una-editions.fr', harvest_una + yield ebook.provider == 'wbg-wissenverbindet.de', harvest_wbg + yield ebook.provider == 'urn.kb.se', harvest_kb + yield ebook.provider == 'publikationen.bibliothek.kit.edu', harvest_kit + yield ebook.provider == 'iupress.istanbul.edu.tr', harvest_istanbul + yield ebook.provider == 'editorialbonaventuriana.usb.edu.co', harvest_editorialbonaventuriana + yield ebook.provider == 'verlag.gta.arch.ethz.ch', harvest_gta + yield ebook.provider == 'manchesteruniversitypress.co.uk', harvest_manu + yield ebook.provider == 'tectum-elibrary.de', harvest_tecnum + yield ebook.provider == 'benjamins.com', harvest_benjamins + yield ebook.provider == 'macau.uni-kiel.de', harvest_citation_meta_generic + yield ebook.provider == 'tabedizioni.it', harvest_tabedizioni + + +def ebf_if_harvested(url): + onlines = models.EbookFile.objects.filter(source=url) + if onlines.exists(): + return onlines + return models.EbookFile.objects.none() + + +def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET', + verify=True, fallback=None): + if not url: + logger.warning('no url for ebook %s', ebook.id) + return None, 0 + logger.info('making %s' % url) + + # check to see if url already harvested + for ebf in ebf_if_harvested(url): + # these ebookfiles are created to short-circuit dl_online to avoid re-harvest + if ebf.ebook == ebook: + return ebf, 0 + new_ebf = 
models.EbookFile.objects.create( + edition=ebf.edition, + format=ebf.format, + file=ebf.file, + source=ebook.url, + ebook=ebook, + ) + logger.info("Previously harvested") + return new_ebf, 0 + + dl_cf, fmt = loader.load_ebookfile(url, ebook.format, + user_agent=user_agent, method=method, verify=verify) + if dl_cf: + return make_harvested_ebook(dl_cf, ebook, fmt, filesize=dl_cf.size) + else: + if fmt == 404: + return set_broken(ebook) + logger.warning('download format %s for %s is not ebook', ebook.format, url) + if fallback: + return fallback(url, ebook) + return None, 0 + + +def redirect_ebook(ebook, verify=True): + """ returns an ebook and status : + -3 : bad return code or problem + -1 : deleted + -2 : dead, but we need to keep items + 0 : replaced with existing + 1 : url updated + + """ + try: + r = requests.head(ebook.url, allow_redirects=True, verify=verify) + except requests.exceptions.ConnectionError as e: + logger.error("Connection refused for %s", ebook.url) + logger.error(e) + return ebook, -3 + + if r.status_code == 404: + if not models.Ebook.ebook_files.exists(): + logger.info('deleting ebook for dead url', ebook.url) + ebook.delete() + return None, -1 + return ebook, -2 + elif r.status_code == 200: + if ebook.url != r.url: + if models.Ebook.objects.exclude(id=ebook.id).filter(url=r.url).exists(): + existing = models.Ebook.objects.filter(url=r.url)[0] + logger.error(f'ebook {ebook.id} redirects to existing {existing.id}') + ebook.format='redirect' + ebook.save() + return existing, 0 + ebook.url = r.url + ebook.set_provider() + ebook.save() + return ebook, 1 + return ebook, 0 + + logger.error("status code %s for %s", r.status_code, ebook.url) + return ebook, -3 + + +def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False): + pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers) + if not pdffile: + return None, 0 + return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf') + + +def 
make_harvested_ebook(content, ebook, format, filesize=0): + if not filesize: + filesize = len(content) + new_ebf = models.EbookFile.objects.create( + edition=ebook.edition, + format=format, + source=ebook.url, + ) + try: + new_ebf.file.save(models.path_for_file(new_ebf, None), content) + new_ebf.save() + except MemoryError: #huge pdf files cause problems here + logger.error("memory error saving ebook file for %s", ebook.url) + new_ebf.delete() + return None, 0 + if ebook.format == "online": + harvested_ebook = models.Ebook.objects.create( + edition=ebook.edition, + format=format, + provider='Unglue.it', + url=new_ebf.file.url, + rights=ebook.rights, + filesize=filesize if filesize < 2147483647 else 2147483647, # largest safe integer + version_label=ebook.version_label, + version_iter=ebook.version_iter, + ) + else: + if not ebook.filesize: + ebook.filesize = filesize if filesize < 2147483647 else 2147483647 + ebook.save() + harvested_ebook = ebook + + new_ebf.ebook = harvested_ebook + new_ebf.save() + return new_ebf, 1 + + +def is_bookshop_url(url): + if '/prodotto/' in url: + return True + if ':' in url and url.split(':')[1].startswith('//library.oapen.org/handle/'): + return True + return False + + +def harvest_generic(ebook, user_agent=settings.USER_AGENT): + if is_bookshop_url(ebook.url): + return set_bookshop(ebook) + return make_dl_ebook(ebook.url, ebook, user_agent=user_agent) + +def harvest_generic_chrome(ebook, ): + return make_dl_ebook(ebook.url, ebook, user_agent=settings.CHROME_UA) + + +def harvest_manual(ebook): + def make_manual_ebf(format): + fname = f'mebf/{ebook.id}.{format}' + if default_storage.exists(fname): + filesize = default_storage.size(fname) + new_ebf = models.EbookFile.objects.create( + edition=ebook.edition, + format=format, + source=ebook.url, + ) + new_ebf.file.name = fname + harvested_ebook = models.Ebook.objects.create( + edition=ebook.edition, + format=format, + provider='Unglue.it', + url=new_ebf.file.url, + rights=ebook.rights, + 
filesize=filesize, + version_label=ebook.version_label, + version_iter=ebook.version_iter, + ) + new_ebf.ebook = harvested_ebook + new_ebf.save() + return new_ebf + else: + return None + pdf_ebf = make_manual_ebf('pdf') + epub_ebf = make_manual_ebf('epub') + + return pdf_ebf or epub_ebf, (1 if pdf_ebf else 0) + (1 if epub_ebf else 0) + + +def harvest_oapen(ebook): + def detect_requestcopy(url, ebook): + doc = get_soup(url, follow_redirects=True) + try: + doc_title = doc.find('title').text + if 'Request a copy' in doc_title: + set_broken(ebook) + except: + pass + return None, 0 + if is_bookshop_url(ebook.url): + return set_bookshop(ebook) + if '/bitstream/' in ebook.url: + return make_dl_ebook(ebook.url, ebook, user_agent=settings.USER_AGENT, + fallback=detect_requestcopy) + return None, 0 + + +def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT): + doc = get_soup(ebook.url, user_agent=user_agent, follow_redirects=True) + if doc: + try: + base = doc.find('base')['href'] + except: + base = ebook.url + obj = selector(doc) + if obj: + dl_url = urljoin(base, obj['href']) + harvest = make_dl_ebook(dl_url, ebook, user_agent=user_agent) + if not harvest[0]: + logger.warning('couldn\'t harvest %s', dl_url) + return harvest + else: + logger.warning('couldn\'t get dl_url for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_multiple_generic(ebook, selector, dl=lambda x:x, + user_agent=settings.USER_AGENT, verify=True): + num = 0 + harvested = None + doc = get_soup(ebook.url, follow_redirects=True, user_agent=user_agent, verify=verify) + if doc: + found = [] + try: + base = doc.find('base')['href'] + except: + base = ebook.url + for obj in selector(doc): + dl_url = dl(urljoin(base, obj.get('href'))) + logger.info(dl_url) + if dl_url in found: + continue + else: + found.append(dl_url) + harvested, made = make_dl_ebook(dl_url, ebook, verify=verify) + num += made + if num == 0: + 
logger.warning('couldn\'t get any dl_url for %s', ebook.url) + return harvested, num + + +def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0, + user_agent=settings.GOOGLEBOT_UA, dl=lambda x:x): + doc = get_soup(ebook.url, user_agent=user_agent, follow_redirects=True) + if doc: + try: + base = doc.find('base')['href'] + except: + base = ebook.url + made = None + + # check for complete ebook + if selector: + obj = selector(doc) + if obj: + dl_url = dl(urljoin(base, obj['href'])) + made = make_dl_ebook(dl_url, ebook) + if made: + return made + + # staple the chapters + pdflinks = [dl(urljoin(base, a['href'])) for a in chap_selector(doc)] + stapled = None + if pdflinks: + stapled = make_stapled_ebook(pdflinks, ebook, user_agent=user_agent, + strip_covers=strip_covers) + if stapled: + return stapled + + logger.warning('couldn\'t make ebook file for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +OPENBOOKPUB = re.compile(r'openbookpublishers.com/+(reader|product|/?download/book|books)/(10\.11647/OBP\.\d+|\d+)') + +def harvest_obp(ebook): + match = OPENBOOKPUB.search(ebook.url) + booknum = None + if not match: + return None, 0 + if match and match.group(1) in ('product', 'reader'): + prodnum = match.group(2) + prod_url = 'https://www.openbookpublishers.com/product/{}'.format(prodnum) + doc = get_soup(prod_url, settings.GOOGLEBOT_UA) + if doc: + obj = doc.find('button', value='Download') + if obj: + booknum = obj.get('onclick') + if booknum: + booknum = OPENBOOKPUB.search(booknum).group(2) + else: + logger.warning('couldn\'t get soup for %s', prod_url) + elif match and match.group(2).startswith('10.'): + dl_url = 'https://books.openbookpublishers.com/' + match.group(2).lower() + '.pdf' + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + else: + booknum = match.group(2) + if not booknum: + logger.warning('couldn\'t get booknum for %s', ebook.url) + return None, 0 + dl_url = 
'https://www.openbookpublishers.com//download/book_content/{}'.format(booknum) + made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA, method='POST') + return made + + +DEGRUYTERFULL = re.compile(r'/downloadpdf/title/.*') +DEGRUYTERCHAP = re.compile(r'/downloadpdf/book/.*') +COMPLETE = re.compile(r'complete ebook', flags=re.I) +DOWNLOAD = re.compile(r' *download *', flags=re.I) + +def harvest_degruyter(ebook): + ebook, status = redirect_ebook(ebook) + if status < 1: + return None, -1 if status < 0 else 0 + doc = get_soup(ebook.url, settings.GOOGLEBOT_UA) + if doc: + try: + base = doc.find('base')['href'] + except: + base = ebook.url + made = 0 + harvested = None + + # check for epub + obj = doc.select_one('a.ga_download_dropdown_epub_book') + if obj: + dl_url = urljoin(base, obj['href']) + harvested, made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + + # check for pdf + obj = doc.select_one('a.downloadCompletePdfBook') + if obj: + dl_url = urljoin(base, obj['href']) + harvested, madepdf = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + made = made + madepdf + if made: + return harvested, made + + # none yet, check for complete ebook + obj = doc.find('a', string=COMPLETE) + if obj: + obj = obj.parent.parent.parent.select_one('a.pdf-link') + else: + obj = doc.find('a', href=DEGRUYTERFULL) + if obj: + dl_url = urljoin(base, obj['href']) + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + + # staple the chapters + pdflinks = [urljoin(base, a['href']) for a in doc.find_all('a', href=DEGRUYTERCHAP)] + stapled = None + if pdflinks: + stapled = make_stapled_ebook(pdflinks, ebook, user_agent=settings.GOOGLEBOT_UA) + if stapled: + return stapled + logger.warning('couldn\'t get dl_url for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_dropbox(ebook): + if ebook.url.find(u'dl=0') >= 0: + dl_url = ebook.url.replace(u'dl=0', u'dl=1') + 
return make_dl_ebook(dl_url, ebook) + elif ebook.url.find(u'?') < 0: + dl_url = ebook.url + u'?dl=1' + return make_dl_ebook(dl_url, ebook) + response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT}) + if response.status_code == 200: + match_dl = DROPBOX_DL.search(response.content) + if match_dl: + return make_dl_ebook(match_dl.group(1), ebook) + else: + logger.warning('couldn\'t get %s', ebook.url) + else: + logger.warning('couldn\'t get dl for %s', ebook.url) + return None, 0 + + +def harvest_jbe(ebook): + def selector(doc): + return doc.select('div.access-options a[href]') + return harvest_multiple_generic(ebook, selector) + + +def harvest_transcript(ebook): + num = 0 + harvested = None + doc = get_soup(ebook.url) + if doc: + objs = doc.select('a.content--link') + for obj in objs: + dl_url = urljoin(ebook.url, obj['href']) + if dl_url.endswith('.pdf') or dl_url.endswith('.epub'): + harvested, made = make_dl_ebook(dl_url, ebook) + num += made + if not harvested: + logger.warning('couldn\'t get any dl_url for %s', ebook.url) + return harvested, num + + +def harvest_ksp(ebook): + def selector(doc): + return doc.select_one('p.linkForPDF a') + return harvest_one_generic(ebook, selector) + + +def harvest_digitalis(ebook): + def selector(doc): + return doc.select_one('a.item-download-button') + return harvest_one_generic(ebook, selector) + + +def harvest_kit(ebook): + def selector(doc): + return doc.select_one('a.downloadTextLink') + return harvest_one_generic(ebook, selector) + + +def harvest_budrich(ebook): + def selector(doc): + return doc.select_one('a.download_pdf') + return harvest_one_generic(ebook, selector) + + +NOMOSPDF = re.compile('download_full_pdf') +def harvest_nomos(ebook): + doc = get_soup(ebook.url, follow_redirects=True) + try: + base = doc.find('base')['href'] + except: + base = ebook.url + + if doc: + obj = doc.find('a', href=NOMOSPDF) + if obj: + dl_url = urljoin(base, obj['href']) + return make_dl_ebook(dl_url, ebook) + else: 
+ logger.warning('will try stapling a book for %s', ebook.url) + + # staple the chapters + chaps = doc.select('li.access[data-doi]') + + pdflinks = [] + for chap in chaps: + link = urljoin( + 'https://www.nomos-elibrary.de', + chap['data-doi'] + '.pdf?download_full_pdf=1' + ) + if link not in pdflinks: + pdflinks.append(link) + stapled = None + if pdflinks: + stapled = make_stapled_ebook(pdflinks, ebook, user_agent=settings.GOOGLEBOT_UA) + if stapled: + return stapled + else: + logger.warning('couldn\'t staple ebook %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_frontiersin(ebook): + if 'GetFile.aspx' in ebook.url: + ebook.delete() + rl.last.pop(ebook.provider, 0) + return None, 0 + + num = 0 + harvested = None + doc = get_soup(ebook.url, follow_redirects=True) + if doc: + for obj in doc.select('button[data-href]'): + dl_url = obj['data-href'] + harvested, made = make_dl_ebook( + dl_url, + ebook, + user_agent=requests.utils.default_user_agent(), + ) + num += made + if num == 0: + logger.warning('couldn\'t get any dl_url for %s', ebook.url) + return harvested, num + + +def harvest_scielo(ebook): + def selector(doc): + return doc.select('a.pdf_file,a.epub_file') + if ebook.url.startswith('http;'): + ebook, status = redirect_ebook(ebook) + if status < 0: + return None, 0 + return harvest_multiple_generic(ebook, selector) + + +def harvest_springerlink(ebook): + def selector(doc): + return doc.select('a[data-book-epub],a[data-book-pdf]') + return harvest_multiple_generic(ebook, selector, user_agent=settings.CHROME_UA) + + +EDOCMAN = re.compile('component/edocman/') +def harvest_pulp(ebook): + def edocman(url): + if not EDOCMAN.search(url): + return + return url + '/download?Itemid=' + dl_url = edocman(ebook.url) + if dl_url: + return make_dl_ebook(dl_url, ebook, user_agent=requests.utils.default_user_agent()) + doc = get_soup(ebook.url) + harvested = None + made = 0 + if doc: + obj = doc.find('a', 
href=EDOCMAN) + if obj: + dl_url = edocman(urljoin(ebook.url, obj['href'])) + harvested, made = make_dl_ebook(dl_url, ebook, + user_agent=requests.utils.default_user_agent()) + if made == 0: + logger.warning('couldn\'t get any dl_url for %s or %s', ebook.url, dl_url) + return harvested, made + + +def harvest_bloomsbury(ebook): + doc = get_soup(ebook.url, follow_redirects=True) + if doc: + pdflinks = [] + try: + base = doc.find('base')['href'] + except: + base = ebook.url + for obj in doc.select('li.pdf-chapter--title a[href]'): + if obj: + chap = urljoin(base, obj['href']) + '.pdf?dl' + pdflinks.append(chap) + stapled = None + if pdflinks: + stapled = make_stapled_ebook(pdflinks, ebook, strip_covers=True) + if stapled: + return stapled + else: + logger.warning('couldn\'t staple %s', pdflinks) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_athabasca(ebook): + def selector(doc): + return doc.select_one('li.downloadPDF a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_usu(ebook): + def selector(doc): + return doc.select_one('#full-text a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_fahce(ebook): + def selector(doc): + return doc.select_one('div.pub_format_single a[href]') + return harvest_one_generic(ebook, selector) + + +def get_meta(doc, term): + obj = doc.find('meta', attrs={"name": term}) + if obj: + return obj.get('content', None) + else: + logger.warning(f'no meta for {term}') + + +BAD_CERTS = { + 'ebooks.marilia.unesp.br', + 'editorial.ucatolicaluisamigo.edu.co', + 'libri.unimi.it', + 'monografias.editorial.upv.es', + 'openpress.mtsu.edu', +} + +def harvest_cmp(ebook): + def selector(doc): + citation_pdf_url = get_meta(doc, "citation_pdf_url") + citation_epub_url = get_meta(doc, "citation_epub_url") + if citation_pdf_url or citation_epub_url: + if citation_pdf_url: + yield {'href': citation_pdf_url} + if citation_epub_url: + yield {'href': citation_epub_url} + 
else: + found = False + for obj in doc.select('div.entry_details a.cmp_download_link[href]'): + found = True + yield obj + + if not found: + objs = doc.select('.chapters a.cmp_download_link[href], .files a.cmp_download_link[href]') + if (len({obj['href'] for obj in objs})) > 1: + return [] + return doc.select('a.cmp_download_link[href]') + + def dl(url): + return url.replace('view', 'download') + '?inline=1' + + verify = ebook.provider not in BAD_CERTS + if '/view/' in ebook.url: + (ebf, num) = make_dl_ebook(dl(ebook.url), ebook, verify=verify) + if num > 0: + return (ebf, num) + return harvest_multiple_generic(ebook, selector, dl=dl, verify=verify) + + +DSPACEPDF = re.compile(r'/bitstream/.*\.(pdf|epub)') +def harvest_dspace(ebook): + def selector(doc): + return doc.find_all(href=DSPACEPDF) + return harvest_multiple_generic(ebook, selector) + + +def harvest_dspace2(ebook): + doc = get_soup(ebook.url) + if doc: + citation_pdf_url = get_meta(doc, "citation_pdf_url") + if citation_pdf_url: + dl_url = urljoin(ebook.url, citation_pdf_url) + dl_url = dl_url.replace('http://', 'https://') + return make_dl_ebook(dl_url, ebook) + else: + logger.warning('couldn\'t get dl_url for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +# won't harvest page-image books +def harvest_unt(ebook): + def selector(doc): + return doc.select_one('#link-pdf-version[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_mdpi(ebook): + def selector(doc): + return doc.select_one('div.main-download-container a[alt=download]') + if 'http://books.mdpi.com' in ebook.url: + ebook.delete() + return None, 0 + elif 'img.mdpi.org' in ebook.url: + return harvest_generic(ebook) + elif re.search(r'mdpi\.com/books/pdfview/(book|topic)/', ebook.url): + return harvest_citation_meta_generic(ebook) + return harvest_one_generic(ebook, selector) + + +def harvest_idunn(ebook): + if '/doi/book/' in ebook.url: + return harvest_manual(ebook) + + doc 
= get_soup(ebook.url) + if doc: + obj = doc.select_one('#accessinfo[data-product-id]') + if obj: + pdf_id = obj.get('data-pdf-id', '') + prod_id = obj.get('data-product-id', '') + filename = obj.get('data-issue-pdf-url', ebook.url[:21]) + dl_url = 'https://www.idunn.no/file/pdf/%s/%s.pdf' % (pdf_id, filename) + ebf, num = make_dl_ebook(dl_url, ebook) + if ebf: + return ebf, num + dl_url = 'https://www.idunn.no/file/pdf/%s/%s.pdf' % (prod_id, filename) + return make_dl_ebook(dl_url, ebook) + return None, 0 + +# some failures are caused by +def harvest_calgary(ebook): + def selector(doc): + # some failures are caused by a fulltext link that points to another html page + return doc.find('a', string=re.compile('Full Text')) + def chap_selector(doc): + return doc.find_all('a', href=re.compile('/bitstream/.+\.pdf')) + return harvest_stapled_generic(ebook, selector, chap_selector, + user_agent=settings.CHROME_UA, strip_covers=2) + + +def harvest_muse(ebook): + def selector(doc): + return doc.select('a.btn_download_full[href]') + def chap_selector(doc): + return doc.find_all('a', href=re.compile(r'/chapter/\d+/pdf')) + harvested, made = harvest_multiple_generic(ebook, selector) + if harvested: + return harvested, made + return harvest_stapled_generic(ebook, None, chap_selector, strip_covers=1) + + +def harvest_mitpress(ebook): + def selector(doc): + return doc.select('a.book-pdfLink[href]') + def chap_selector(doc): + return doc.select('a.section-pdfLink[href]') + return harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0) + + +def harvest_ios(ebook): + booknum = None + doc = get_soup(ebook.url) + if doc: + obj = doc.find('link', rel='image_src', href=True) + if obj: + booknum = obj['href'].replace('http://ebooks.iospress.nl/Cover/', '') + if booknum: + dl_url = 'http://ebooks.iospress.nl/Download/Pdf?id=%s' % booknum + return make_dl_ebook(dl_url, ebook, method='POST') + else: + logger.warning('couldn\'t get booknum for %s', ebook.url) + else: + 
logger.warning('couldn\'t get link for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_elgar(ebook): + if 'display' in ebook.url: + url = ebook.url.replace('display', 'downloadpdf')[:-3] + 'pdf' + elif 'monobook-oa' in ebook.url: + url = ebook.url.replace('monobook-oa', 'downloadpdf')[:-3] + 'pdf' + elif 'edcollbook-oa' in ebook.url: + url = ebook.url.replace('edcollbook-oa', 'downloadpdf')[:-3] + 'pdf' + else: + return None, 0 + return make_dl_ebook(url, ebook, user_agent=settings.GOOGLEBOT_UA) + + +def harvest_wsp(ebook): + idmatch = re.search(r'1142/(\d+)', ebook.url) + if idmatch: + url = 'https://www.worldscientific.com/doi/pdf/10.1142/%s?download=true' % idmatch.group(1) + return make_dl_ebook(url, ebook, user_agent=settings.CHROME_UA) + return None, 0 + + +def harvest_mprl(ebook): + def selector(doc): + return doc.select('a.ml-20[href]') + return harvest_multiple_generic(ebook, selector) + + +def harvest_rti(ebook): + def selector(doc): + return doc.find('a', href=re.compile('fulltext.pdf')) + return harvest_one_generic(ebook, selector) + + +def harvest_unibas(ebook): + def selector(doc): + return doc.select_one('a.ep_document_link[href]') + return harvest_one_generic(ebook, selector) + + +PENSOFT = re.compile(r'/book/(\d+)/list/') +def harvest_pensoft(ebook): + if ebook.id == 263395: + book_id = '12847' + elif ebook.url.startswith('https://books.pensoft.net/books/'): + book_id = ebook.url[32:] + elif PENSOFT.search(ebook.url): + book_id = PENSOFT.search(ebook.url).group(1) + else: + return None, 0 + r = requests.get('https://books.pensoft.net/api/books/' + book_id) + if r.status_code == 200: + try: + file_id = r.json()['data']['item_files'][0]['id'] + return make_dl_ebook('https://books.pensoft.net/api/item_files/%s' % file_id, ebook) + except IndexError: + logger.error('no item_file for %s', ebook.url) + return None, 0 + + +def harvest_edp(ebook): + def selector(doc): + return 
doc.select_one('a.book-dl[href]') + if ebook.url.endswith('.pdf'): + return harvest_generic(ebook, user_agent=settings.CHROME_UA) + return harvest_one_generic(ebook, selector, user_agent=settings.CHROME_UA) + + +def harvest_edpsciences(ebook): + def selector(doc): + return doc.select_one('.article-open-access-download-cell a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_waxmann(ebook): + ebook.url = ebook.url.replace('http://', 'https://') + if ebook.url.startswith('https://www.waxmann.com/buch'): + return make_dl_ebook(ebook.url.replace('buch', 'index.php?eID=download&buchnr='), ebook) + return None, 0 + + +def harvest_tecnum(ebook): + if ebook.url.startswith('https://doi.org/10.5771/'): + url = 'https://www.tectum-elibrary.de/10.5771/' + ebook.url[24:] + '-I.pdf' + return make_dl_ebook(url, ebook) + return None, 0 + + +def harvest_ojs(ebook): + def selector(doc): + return doc.select('#articleFullText a[href]') + def dl(url): + return url.replace('view', 'download') + '?inline=1' + return harvest_multiple_generic(ebook, selector, dl=dl) + + +def harvest_topoi(ebook): + def selector(doc): + return doc.select_one('li.pdf a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_meson(ebook): + def selector(doc): + for btn in doc.select('a[href] btn.btn-openaccess'): + yield btn.parent + return harvest_multiple_generic(ebook, selector) + + +def harvest_brill(ebook): + r = requests.get(ebook.url, headers={'User-Agent': settings.GOOGLEBOT_UA}) + if r.url.startswith('https://brill.com/view/title/'): + dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[29:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + elif r.url.startswith('https://brill.com/display/title/'): + dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[32:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + elif r.url.startswith('https://brill.com/edcollbook-oa/title/'): + dl_url = 
'https://brill.com/downloadpdf/title/%s.pdf' % r.url[38:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + return None, 0 + + +def harvest_doi(ebook): + # usually a 404. + ebook, status = redirect_ebook(ebook) + if status == -2: + return None, -1 + return None, 0 + + +def harvest_doi_coaccess(ebook): + # make a new ebook for the "main pub" and ignore the "related pub" + if ebook.url.startswith('https://doi.org/'): + api_url = 'https://apps.crossref.org/search/coaccess?doi=%s' % quote( + ebook.url[16:], safe='') + r = requests.get(api_url) + if r.status_code == 200: + data = r.json() + url = data.get('url', '') + if not url: + return None, 0 + if models.Ebook.objects.exclude(id=ebook.id).filter(url=url).exists(): + # already taken care of + return set_bookshop(ebook) + + # a new ebook + format = loader.type_for_url(url) + if format in ('pdf', 'epub', 'html', 'online'): + new_ebook = models.Ebook() + new_ebook.format = format + new_ebook.url = url + new_ebook.rights = ebook.rights + new_ebook.edition = ebook.edition + new_ebook.set_provider() + if format == "online": + new_ebook.active = False + new_ebook.save() + set_bookshop(ebook) + if format in DOWNLOADABLE: + return make_dl_ebook(url, ebook) + return None, 0 + + +GUID = re.compile(r'FBInit\.GUID = \"([0-9a-z]+)\"') +LIBROSID = re.compile(r'(\d+)$') +LIBROSROOT = 'https://libros.uchile.cl/files/presses/1/monographs/%s/submission/proof/' +LIBROSINDEX = LIBROSROOT + 'index.html' +LIBROSJSON = LIBROSROOT + 'files/assets/html/workspace.js?uni=%s' +LIBRODPDF = LIBROSROOT + 'files/assets/common/downloads/%s?uni=%s' + +def harvest_libroschile(ebook): + booknum = LIBROSID.search(ebook.url).group(1) + if not booknum: + return None, 0 + viewurl = LIBROSINDEX % booknum + doc = get_soup(viewurl) + if not doc: + return None, 0 + hit = doc.find(string=GUID) + if not hit: + return None, 0 + guid = GUID.search(hit) + if not guid: + return None, 0 + jsonurl = LIBROSJSON % (booknum, guid) + try: + json = 
requests.get(jsonurl).json() + except: + return None, 0 + if not json: + return None, 0 + filename = json.get('downloads',{}).get('url', None) + if not filename: + return None, 0 + pdfurl = LIBRODPDF % (booknum, filename, guid) + return make_dl_ebook(pdfurl, ebook) + + +def harvest_ipsflab(ebook): + def selector(doc): + return doc.find_all('a', href=re.compile(r'/system/files/ispf_lab/quaderni/.*\.(pdf|epub)')) + return harvest_multiple_generic(ebook, selector) + + +def harvest_figshare(ebook): + def selector(doc): + return doc.find('a', href=re.compile(r'/ndownloader/')) + return harvest_one_generic(ebook, selector) + + +def harvest_fupress(ebook): + def selector(doc): + return doc.select_one('#ctl00_contenuto_pdf a.btn-open[href]') + if 'isbn' in ebook.url: + set_bookshop(ebook) + return None, 0 + return harvest_one_generic(ebook, selector) + + +def harvest_funlam(ebook): + if '/modules/' in ebook.url: + set_bookshop(ebook) + return None, 0 + return make_dl_ebook(ebook.url, ebook) + + +def harvest_dunckerhumblot(ebook): + def selector(doc): + return doc.select_one('div.section__buttons a[href$="download"]') + return harvest_one_generic(ebook, selector) + + +def harvest_cornellopen(ebook): + def selector(doc): + return doc.select('div.sp-product__buy-btn-container li a[href]') + return harvest_multiple_generic(ebook, selector) + + +def harvest_editorialbonaventuriana(ebook): + def selector(doc): + return doc.select_one('div.djc_fulltext p a[href$=".pdf"]') + return harvest_one_generic(ebook, selector) + + +def harvest_esv(ebook): + doc = get_soup(ebook.url.replace('details', 'download')) + if doc: + obj = doc.select_one('div.content-box a[href$=".pdf"]') + if obj: + return make_dl_ebook(obj['href'], ebook) + else: + logger.warning('couldn\'t get link for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_fulcrum(ebook): + def selector(doc): + return doc.select('ul.monograph-catalog-rep-downloads 
a[href]') + return harvest_multiple_generic(ebook, selector) + + +def harvest_ubiquity(ebook): + def selector(doc): + return doc.find_all('a', attrs={'data-category': re.compile('(epub|pdf) download')}) + return harvest_multiple_generic(ebook, selector) + + +def harvest_orkana(ebook): + def selector(doc): + for obj in doc.find_all('p', string=re.compile(r'\((PDF|E-BOK)\)')): + div = obj.find_parent('div') + if div and div.find_next_sibling('div') and div.find_next_sibling('div').find('a'): + yield div.find_next_sibling('div').find('a') + return harvest_multiple_generic(ebook, selector) + + +def harvest_euna(ebook): + if '/view/' in ebook.url: + return make_dl_ebook(ebook.url.replace('view', 'download'), ebook) + set_bookshop(ebook) + return None, 0 + + +def harvest_orl(ebook): + if ebook.url.startswith('https://openresearchlibrary.org/viewer/'): + orl_id = ebook.url[39:] + return make_dl_ebook( + f'https://openresearchlibrary.org/ext/api/media/{orl_id}/assets/external_content.pdf', + ebook) + return None, 0 + + +def harvest_pressesagro(ebook): + def selector(doc): + return doc.select_one('#sidebar ul li span a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_buponline(ebook): + def selector(doc): + return doc.find('a', string=DOWNLOAD) + return harvest_one_generic(ebook, selector) + + +INTECH = re.compile(r'\.intechopen\.com/books/(\d+)$') +def harvest_intech(ebook): + booknum = INTECH.search(ebook.url) + if booknum: + url = (f'https://mts.intechopen.com/storage/books/{booknum.group(1)}/authors_book/authors_book.pdf') + return make_dl_ebook(url, ebook) + return None, 0 + + +def harvest_usmcu(ebook): + def selector(doc): + return doc.find('a', string='PDF download') + return harvest_one_generic(ebook, selector) + + +def harvest_upv(ebook): + def selector(doc): + return doc.select_one('a.descargar[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_una_editions(ebook): + doc = get_soup(ebook.url) + if doc: + obj = 
doc.find('a', class_='jet-listing-dynamic-link__link', href=True, string='PDF') + if obj: + return make_dl_ebook(obj['href'], ebook) + else: + logger.warning('couldn\'t get link for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_cambridge(ebook): + ebook, status = redirect_ebook(ebook) + doc = get_soup(ebook.url) + if doc: + obj = doc.find('a', string=re.compile('Full book PDF')) + if obj and obj['href']: + dl_url = urljoin(ebook.url, obj['href']) + return make_dl_ebook(dl_url, ebook) + obj = doc.find('meta', attrs={"name": re.compile("citation_pdf_url")}) + if obj and obj['content']: + dl_url = obj['content'] + return make_dl_ebook(dl_url, ebook) + pdflinks = [] + for obj in doc.select('a[data-pdf-content-id]'): + if obj and obj['href']: + chap = urljoin(ebook.url, obj['href']) + pdflinks.append(chap) + stapled = None + if pdflinks: + stapled = make_stapled_ebook(pdflinks, ebook) + if stapled: + return stapled + else: + logger.warning('couldn\'t staple %s', pdflinks) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_exon(ebook): + doc = get_soup(ebook.url) + if doc: + pdflinks = [] + for obj in doc.select('a.galley-link.pdf[href]'): + if obj and obj['href']: + chap = obj['href'].replace('/view/', '/download/') + pdflinks.append(chap) + stapled = None + if pdflinks: + stapled = make_stapled_ebook(pdflinks, ebook) + if stapled: + return stapled + else: + logger.warning('couldn\'t staple %s', pdflinks) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + +def harvest_una(ebook): + def selector(doc): + return doc.select_one('#header-primary-action a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_wbg(ebook): + ''' most of these are archived under files.wbg-wissenverbindet.de ''' + doc = get_soup(ebook.url) + if doc: + sku_obj = doc.select_one('span[itemprop=sku]') + sku = sku_obj.text.strip() if 
sku_obj else None + if sku: + url = f'https://files.wbg-wissenverbindet.de/Files/Article/ARTK_ZOA_{sku}_0001.pdf' + return make_dl_ebook(url, ebook) + return None, 0 + + +def harvest_kb(ebook): + def selector(doc): + return doc.select_one('a[title=fulltext][href]') + return harvest_one_generic(ebook, selector) + +def harvest_tabedizioni(ebook): + def selector(doc): + return doc.find(href=re.compile("/web/content/")) + return harvest_one_generic(ebook, selector) + + +def harvest_istanbul(ebook): + def cdn_url(soup): + objs = soup.find_all('a', href=re.compile(r'cdn\.istanbul')) + for obj in objs: + yield obj['href'] + def pdf_urls(ebook): + doc = get_soup(ebook.url, user_agent=settings.GOOGLEBOT_UA, follow_redirects=True) + if doc: + for content_url in cdn_url(doc): + yield content_url + for obj in doc.select('div.post-content h5 a.from-journal[href]'): + chap_url = urljoin(ebook.url, obj['href']) + chap_doc = get_soup(chap_url, user_agent=settings.GOOGLEBOT_UA, follow_redirects=True) + if chap_doc: + for content_url in cdn_url(chap_doc): + yield content_url + + # staple the chapters + stapled = make_stapled_ebook(pdf_urls(ebook), ebook, user_agent=settings.GOOGLEBOT_UA) + if stapled: + return stapled + else: + logger.warning('couldn\'t make ebook file for %s', ebook.url) + return None, 0 + + +def harvest_gta(ebook): + # https://verlag.gta.arch.ethz.ch/en/gta:book_978-3-85676-393-0 + pos = ebook.url.find('_') + if pos < 1: + return None, 0 + isbn = ebook.url[pos + 1:] + api_host = 'https://api.verlag.gta.arch.ethz.ch' + json_url = f'{api_host}/api/v1/graphs/gta/data/gtaapi:PublicRetrieveBook/gta:book_{isbn}/' + r = requests.get(json_url) + if r.status_code == 200: + try: + file_url = None + graph = r.json()['@graph'] + for obj in graph: + if "gtaapi:file_url" in obj: + file_url = obj["gtaapi:file_url"] + break + if file_url: + return make_dl_ebook(file_url, ebook) + except IndexError: + logger.error('no item_file for %s', ebook.url) + return None, 0 + + +def 
harvest_manu(ebook): + def chap_selector(doc): + return doc.select('div.content-box-body div.book-toc a.c-Button--link[href*="/display/"]') + def dl(url): + return url.replace('/display/', '/downloadpdf/').replace('.xml', '.pdf') + doc = get_soup(ebook.url, follow_redirects=True, user_agent=settings.CHROME_UA) + if doc: + obj = doc.find('a', string=re.compile(r"Open Access")) + if not obj or 'href' not in obj.attrs: + return None, 0 + ebook.url = urljoin(ebook.url, obj['href']) + return harvest_stapled_generic(ebook, lambda x: None, chap_selector, + user_agent=settings.CHROME_UA, dl=dl) + return None, 0 + + +def harvest_sciendo(ebook): + def selector(doc): + json_obj = doc.find('script', id='__NEXT_DATA__') + if json_obj: + try: + json_data = json.loads(json_obj.string) + pdf_url = json_data['props']['pageProps']['product']['pdfLink'] + epub_url = json_data['props']['pageProps']['product']['epubLink'] + if pdf_url or epub_url: + if pdf_url: + yield {'href': pdf_url} + if epub_url: + yield {'href': epub_url} + except json.JSONDecodeError as je: + logger.error(f'Bad json {je.msg}') + except KeyError as ke: + logger.error('No links in json for {ebook.url}') + return harvest_multiple_generic(ebook, selector) + +# 2step +def harvest_liege(ebook): + def selector(doc): + urls = [] + pages = doc.find_all('a', href=re.compile(r'/(front|back)-matter/')) + for page in pages: + page_doc = get_soup(page['href'], follow_redirects=True, user_agent=settings.USER_AGENT) + if page_doc: + links = page_doc.find_all('a', href=re.compile(r'orbi\.uliege\.be/(bitstream|handle)/')) + for link in links: + if link['href'] not in urls: + urls.append(link['href']) + pdf = epub = repo = None + for content_url in urls: + if content_url.lower().endswith('.pdf'): + pdf = pdf or content_url + elif content_url.lower().endswith('.epub'): + epub = epub or content_url + else: + repo = repo or content_url + if pdf and epub: + break + if pdf: + yield {'href': pdf} + if epub: + yield {'href': epub} + if 
repo and not (pdf or epub): + repo_doc = get_soup(repo, follow_redirects=True, user_agent=settings.USER_AGENT) + if repo_doc: + return repo_doc.find_all(href=DSPACEPDF) + + return harvest_multiple_generic(ebook, selector) + +# 2step +def harvest_benjamins(ebook): + def selector(doc): + urls = [] + page = doc.find('a', href=re.compile(r'jbe-platform.com')) + if page: + base = page['href'] + base_doc = get_soup(base, follow_redirects=True) + if base_doc: + links = base_doc.select('.access-options a[href]') + for link in links: + dl_url = urljoin(base, link['href']) + yield {'href': dl_url} + return harvest_multiple_generic(ebook, selector) + +def harvest_citation_meta_generic(ebook): + def selector(doc): + citation_pdf_url = get_meta(doc, "citation_pdf_url") + citation_epub_url = get_meta(doc, "citation_epub_url") + if citation_pdf_url or citation_epub_url: + if citation_pdf_url: + yield {'href': citation_pdf_url} + if citation_epub_url: + yield {'href': citation_epub_url} + return harvest_multiple_generic(ebook, selector) diff --git a/core/loaders/ku.py b/core/loaders/ku.py new file mode 100644 index 000000000..19a36dfb1 --- /dev/null +++ b/core/loaders/ku.py @@ -0,0 +1,171 @@ +import requests +from bs4 import BeautifulSoup +from django.conf import settings + +from regluit.core.validation import ( + authlist_cleaner, + identifier_cleaner, + valid_subject, + validate_date, +) +from regluit.core.bookloader import add_from_bookdatas +from regluit.core.models import EbookFile +from regluit.core.parameters import DOWNLOADABLE + +from .multiscrape import BaseMultiScraper, multiscrape +from .utils import ids_from_urls + +class KUMultiScraper(BaseMultiScraper): + parser_name = 'xml' + can_scrape_hosts = ['app.knowledgeunlatched.org'] + + @classmethod + def divider(cls, doc): + return doc.find_all('Submission') + + @classmethod + def get_response(cls, url): + return cls.login().get(url) + + @classmethod + def login(cls): + s = requests.Session() + credentials = {'username': 
settings.KU_EMAIL, 'password': settings.KU_PASSWORD} + r = s.get('https://app.knowledgeunlatched.org/login') + auth_url = BeautifulSoup(r.content, "lxml").find(id='kc-form-login')['action'] + r = s.post(auth_url, data=credentials) + return s + + def get_license(self): + val = self.fetch_one_el_content('LicenseURL') + if val: + self.set('rights_url', val) + + def get_title(self): + val = self.fetch_one_el_content('Title') + if val: + self.set('title', val) + + def get_description(self): + val = self.fetch_one_el_content('Description') + coll = self.doc.select_one('Funder ProgramName') + coll = u"
    This book is made open access as part of the Knowledge Unlatched {}".format(coll.text) if coll else '' + if val: + self.set('description', val + coll) + + def get_genre(self): + val = self.fetch_one_el_content('Type') + if val: + self.set('genre', val) + + def get_language(self): + val = self.fetch_one_el_content('Language') + if val: + self.set('language', val) + + def get_keywords(self): + subjects = [self.fetch_one_el_content('PrimarySubject')] + for subject in self.doc.find_all('ManualSubject'): + subjects.append(subject.text) + bisac = self.fetch_one_el_content('BISAC') + if bisac: + subjects.append((u'bisacsh', bisac)) + subjects.append('KUnlatched') + self.set('subjects', subjects) + + def get_publisher(self): + val = self.fetch_one_el_content('PublisherName') + if val: + self.set('publisher', val) + + def get_cover(self): + image_url = self.fetch_one_el_content('Cover') + if image_url: + self.set('covers', [{'image_url': image_url}]) + + def get_pubdate(self): + value = self.fetch_one_el_content('PublicationDate') + if value: + value = validate_date(value) + if value: + self.set('publication_date', value) + + def get_authors(self): + def fullname(auth): + firstname = auth.FirstName.text + lastname = auth.LastName.text + return u'{} {}'.format(firstname, lastname) + authors = self.doc.find_all('Author') + creator_list = [] + role = 'author' + for author in authors: + creator_list.append({'agent_name': fullname(author)}) + role = author.Role.text + self.set('creator', {'{}s'.format(role): creator_list }) + + def get_downloads(self): + fts = DOWNLOADABLE + dls = self.doc.find_all('Document') + for dl in dls: + dlft = dl.Type.text + url = dl.Path.text + for ft in fts: + if ft in dlft: + dlft = ft + break + if url: + self.set('download_url_{}'.format(dlft), url) + + def get_isbns(self): + isbn_cleaner = identifier_cleaner('isbn', quiet=True) + isbns = {} + isbn = isbn_cleaner(self.fetch_one_el_content('IsbnHardback')) + if isbn: + isbns['isbn_hard'] = isbn + 
isbn = isbn_cleaner(self.fetch_one_el_content('IsbnPaperback')) + if isbn: + isbns['isbn_paper'] = isbn + isbn = isbn_cleaner(self.fetch_one_el_content('IsbnEpdf')) + if isbn: + isbns['isbn_pdf'] = isbn + isbn = isbn_cleaner(self.fetch_one_el_content('IsbnEpub')) + if isbn: + isbns['isbn_epub'] = isbn + return isbns + + def get_identifiers(self): + doi_cleaner = identifier_cleaner('doi', quiet=True) + super(KUMultiScraper, self).get_identifiers() + url = self.fetch_one_el_content('Doi') + if url: + doi = doi_cleaner(url) + if doi: + self.identifiers['doi'] = doi + url = self.fetch_one_el_content('OAPENURL') + if url: + oapn = ids_from_urls(url).get('oapn', None) + if oapn: + self.identifiers['oapn'] = oapn + +ku_rounds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, + 23, 26, 27, 29, 31, 33, 42, 49, 50, 51, 52] + +def load_ku(ku_round=None): + rounds = [ku_round] if ku_round else ku_rounds + editions = [] + for around in rounds: + ku_url = 'https://app.knowledgeunlatched.org/api/rounds/{}/submissions.xml'.format(around) + scrapers = multiscrape(ku_url, scraper_class=KUMultiScraper) + editions.extend(add_from_bookdatas(scrapers)) + return editions + +def activate_ku_ebooks(): + to_activate = EbookFile.objects.filter( + source__startswith='https://app.knowledgeunlatched.org/uploads/', + ebook__active=False, + ) + num_to_activate = to_activate.count() + for ebf in to_activate: + ebf.ebook.activate() + return num_to_activate + diff --git a/core/loaders/multiscrape.py b/core/loaders/multiscrape.py new file mode 100644 index 000000000..40ce5ea2d --- /dev/null +++ b/core/loaders/multiscrape.py @@ -0,0 +1,102 @@ +import logging +import re +from urllib.parse import urljoin + +from bs4 import BeautifulSoup +import requests + +from django.conf import settings + +from regluit.core.bookloader import add_from_bookdatas +from regluit.core.loaders.scrape import BaseScraper +from regluit.core.validation import identifier_cleaner + +logger = logging.getLogger(__name__) +''' +use 
for web pages with multiple books +returns an iterator of scrapers +''' + +class BaseMultiScraper(BaseScraper): + parser_name = 'lxml' + def __init__(self, url, doc): + self.metadata = {} + self.identifiers = {} + self.doc = doc + self.base = url + self.get_all() + if not self.metadata.get('title', None): + self.set('title', '!!! missing title !!!') + self.metadata['identifiers'] = self.identifiers + + @classmethod + def login(cls): + return requests + +def multiscrape(url, scraper_class=BaseMultiScraper): + try: + response = scraper_class.get_response(url) + if response.status_code == 200: + doc = BeautifulSoup(response.content, scraper_class.parser_name) + sections = scraper_class.divider(doc) + for section in sections: + yield scraper_class(url, section) + except requests.exceptions.RequestException as e: + logger.error(e) + self.metadata = None + + +# following is code specific to edp-open.org; refactor when we add another + + +ISBNMATCH = re.compile(r'([\d\-]+)') +class EDPMultiScraper(BaseMultiScraper): + @classmethod + def divider(cls, doc): + return doc.select('article.Bk') + + def get_isbns(self): + '''return a dict of edition keys and ISBNs''' + isbns = {} + isbn_cleaner = identifier_cleaner('isbn', quiet=True) + labels = ['epub', 'pdf', 'paper'] + info = self.doc.select_one('p.nfo').text + isbntexts = re.split('ISBN', info) + for isbntext in isbntexts[1:]: + isbnmatch = ISBNMATCH.search(isbntext) + if isbnmatch: + isbn = isbn_cleaner(isbnmatch.group(0)) + isbns[labels.pop()] = isbn + return isbns + + def get_downloads(self): + dl = self.doc.select_one('nav.dl') + links = dl.select('a.fulldl') + for link in links: + href = urljoin(self.base, link['href']) + if href.endswith('.pdf'): + self.set('download_url_pdf', href) + elif href.endswith('.epub'): + self.set('download_url_epub', href) + + def get_language(self): + if 'english' in self.base: + self.set('language', 'en') + else: + self.set('language', 'fr') + + def get_title(self): + value = 
self.doc.select_one('h2').text + book_id = self.doc.select_one('h2')['id'] + self.identifiers['http'] = u'{}#{}'.format(self.base, book_id) + self.set('title', value) + +def edp_scrape(): + edp_urls = [ + 'https://www.edp-open.org/books-in-french', + 'https://www.edp-open.org/books-in-english', + ] + for url in edp_urls: + scrapers = multiscrape(url, scraper_class=EDPMultiScraper) + add_from_bookdatas(scrapers) + diff --git a/core/loaders/pressbooks.py b/core/loaders/pressbooks.py index 47291e896..0e84b7a88 100644 --- a/core/loaders/pressbooks.py +++ b/core/loaders/pressbooks.py @@ -1,18 +1,26 @@ +import re from regluit.core.validation import identifier_cleaner from . import BaseScraper class PressbooksScraper(BaseScraper): - can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu', - 'press.rebus.community', 'pb.unizin.org'] + can_scrape_hosts = [ + 'bookkernel.com', 'milnepublishing.geneseo.edu', 'press.rebus.community', 'pb.unizin.org', + 'opentext.wsu.edu', 'oer.missouriwestern.edu', 'eskript.ethz.ch', 'opentext.lib.vt.edu', + 'opentextbc.ca', + ] can_scrape_strings = ['pressbooks'] def get_downloads(self): - for dl_type in ['epub', 'mobi', 'pdf']: + for dl_type in ['epub', 'pdf']: download_el = self.doc.select_one('.{}'.format(dl_type)) + value = None if download_el and download_el.find_parent(): value = download_el.find_parent().get('href') - if value: - self.set('download_url_{}'.format(dl_type), value) + else: + a = self.doc.find('a', href=re.compile(r'{}$'.format(dl_type))) + value = a.get('href') if a else None + if value: + self.set('download_url_{}'.format(dl_type), value) def get_publisher(self): value = self.get_dt_dd('Publisher') @@ -22,8 +30,10 @@ def get_publisher(self): if value: self.set('publisher', value) else: - super(PressbooksScraper, self).get_publisher() - + value = self.check_metas(['citation_publisher', 'publisher', r'DC\.Source']) + if value: + self.set('publisher', value) + def get_title(self): value = 
self.doc.select_one('.entry-title a[title]') value = value['title'] if value else None diff --git a/core/loaders/routledge.py b/core/loaders/routledge.py new file mode 100644 index 000000000..808c7a652 --- /dev/null +++ b/core/loaders/routledge.py @@ -0,0 +1,114 @@ +from __future__ import print_function +import re +import requests +from bs4 import BeautifulSoup +from django.conf import settings + +from regluit.core.bookloader import add_from_bookdatas + +from .scrape import BaseScraper + +isbnmatch = re.compile(r'\d{13}') +readbook = re.compile('Read Book') + +class RoutledgeScraper(BaseScraper): + can_scrape_hosts = ['www.routledge.com'] + + def get_keywords(self): + subjects = [] + for sub in self.doc.select('dl.dl-codes dt'): + subjects.append(('bisacsh', sub.string)) + self.set('subjects', subjects) + + def get_author_list(self): + value_list = [] + for auth in self.doc.select('h4.media-author a'): + value_list.append(auth.string) + return value_list + + def get_role(self): + return 'editor' if self.doc.find(string="Edited by ") else 'author' + + def get_isbns(self): + '''return a dict of edition keys and ISBNs''' + def get_isbn(url): + match = isbnmatch.search(url) + if match: + return match.group(0) + + def get_eisbn(eurl): + response = requests.get(eurl, allow_redirects=False) + if response.status_code in (301, 302): + eurl = response.headers['Location'] + return get_isbn(eurl) + + isbns = super(RoutledgeScraper, self).get_isbns() + readbookstr = self.doc.find(string=readbook) + if readbookstr: + eurl = readbookstr.find_parent()['href'] + eisbn = get_eisbn(eurl) + if eisbn: + isbns['ebook'] = eisbn + return isbns + + def get_description(self): + value = self.get_itemprop('description', list_mode='one_item') + if not value: + value = self.check_metas([ + r'dc\.description', + 'og:description', + 'description' + ]) + self.set('description', value) + + def get_publisher(self): + self.set('publisher', "Routledge") + + def get_title(self): + value = 
self.check_metas([r'dc\.title', 'citation_title', 'og:title', 'title']) + if not value: + value = self.fetch_one_el_content('title') + to_delete = ["(Open Access)", "(Hardback)", "- Routledge"] + for text in to_delete: + value = value.replace(text, "") + self.set('title', value) + + +def load_routledge(): + search_url = "https://www.routledge.com/collections/11526" + + def get_collections(url): + try: + response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) + if response.status_code == 200: + doc = BeautifulSoup(response.content, 'lxml') + for link in doc.find_all('a', href=re.compile('collections/11526/')): + yield (link.text, "https://www.routledge.com/" + link['href']) + except requests.exceptions.ConnectionError: + print('couldn\'t connect to %s' % search_url) + + def get_coll_books(url): + try: + response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) + if response.status_code == 200: + doc = BeautifulSoup(response.content, 'lxml') + for link in doc.select('.media-title a'): + yield link['href'] + except requests.exceptions.ConnectionError: + print('couldn\'t connect to %s' % url) + + books = {} + for (subject, coll_url) in get_collections(search_url): + print(subject) + for book_url in get_coll_books(coll_url): + if not book_url in books: + print(book_url) + new_book = RoutledgeScraper(book_url) + new_book.metadata['subjects'].append(subject) + books[book_url] = new_book + else: + books[book_url].metadata['subjects'].append(subject) + print("Harvesting %s books" % len(list[books.values()])) + add_from_bookdatas(books.values()) + return books + \ No newline at end of file diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py index 04a40e708..863d92714 100644 --- a/core/loaders/scrape.py +++ b/core/loaders/scrape.py @@ -1,14 +1,18 @@ import re import logging -from urlparse import urlparse +from urllib.parse import urlparse, urljoin import requests from bs4 import BeautifulSoup #from gitenberg.metadata.pandata import 
Pandata from django.conf import settings -from urlparse import urljoin from regluit.core import models -from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date +from regluit.core.validation import ( + authlist_cleaner, + identifier_cleaner, + valid_subject, + validate_date, +) logger = logging.getLogger(__name__) @@ -22,6 +26,8 @@ class BaseScraper(object): ''' can_scrape_hosts = False can_scrape_strings = False + parser_name = 'lxml' + @classmethod def can_scrape(cls, url): ''' return True if the class can scrape the URL ''' @@ -39,46 +45,47 @@ def can_scrape(cls, url): return True return False - def __init__(self, url): - self.metadata = {} + @classmethod + def get_response(cls, url): + try: + return requests.get(url, headers={"User-Agent": settings.USER_AGENT}) + except requests.exceptions.RequestException as e: + logger.error(e) + + def __init__(self, url, initial={}): + self.metadata = initial self.identifiers = {'http': url} self.doc = None self.base = url - try: - response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) + response = type(self).get_response(url) + if response: if response.status_code == 200: self.base = response.url - self.doc = BeautifulSoup(response.content, 'lxml') + self.doc = BeautifulSoup(response.content, self.parser_name) for review in self.doc.find_all(itemtype="http://schema.org/Review"): review.clear() - self.setup() - self.get_genre() - self.get_title() - self.get_language() - self.get_description() - self.get_identifiers() - self.get_keywords() - self.get_publisher() - self.get_pubdate() - self.get_authors() - self.get_cover() - self.get_downloads() - self.get_license() - if not self.metadata.get('title', None): - self.set('title', '!!! 
missing title !!!') - if not self.metadata.get('language', None): - self.set('language', 'en') - except requests.exceptions.RequestException as e: - logger.error(e) - self.metadata = {} - self.metadata['identifiers'] = self.identifiers + self.get_all() + if not self.metadata.get('title', None): + self.set('title', '!!! missing title !!!') + if not self.metadata.get('language', None): + self.set('language', 'en') + self.metadata['identifiers'] = self.identifiers + else: + self.metadata = None + else: + self.metadata = None + # # utilities # def set(self, name, value): - self.metadata[name] = value + if isinstance(value, str): + value= value.strip() + if value or name not in self.metadata: + self.metadata[name] = value + def fetch_one_el_content(self, el_name): data_el = self.doc.find(el_name) @@ -122,7 +129,7 @@ def get_dt_dd(self, name): ''' get the content of
    after a
    containing name''' dt = self.doc.find('dt', string=re.compile(name)) dd = dt.find_next_sibling('dd') if dt else None - return dd.text if dd else None + return dd.text.strip() if dd and dd.text else None def get_itemprop(self, name, **attrs): value_list = [] @@ -137,13 +144,33 @@ def get_itemprop(self, name, **attrs): else: if el.text: value_list.append(el.text) - elif el.has_key('content'): + elif 'content' in el: value_list.append(el['content']) return value_list + + def get_all(self): + self.setup() + self.get_genre() + self.get_title() + self.get_language() + self.get_description() + self.get_identifiers() + self.get_keywords() + self.get_publisher() + self.get_pubdate() + self.get_authors() + self.get_cover() + self.get_downloads() + self.get_license() + self.cleanup() def setup(self): # use this method to get auxiliary resources based on doc pass + + def cleanup(self): + # use this method to process collected data + pass # # getters # @@ -175,8 +202,7 @@ def get_isbns(self): '''return a dict of edition keys and ISBNs''' isbns = {} isbn_cleaner = identifier_cleaner('isbn', quiet=True) - label_map = {'epub': 'EPUB', 'mobi': 'Mobi', - 'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'} + label_map = {'epub': 'EPUB', 'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'} for key in label_map.keys(): isbn_key = 'isbn_{}'.format(key) value = self.check_metas(['citation_isbn'], type=label_map[key]) @@ -195,7 +221,7 @@ def get_isbns(self): def get_identifiers(self): value = self.check_metas([r'DC\.Identifier\.URI']) if not value: - value = self.doc.select_one('link[rel=canonical]') + value = self.doc.select_one('link[rel=canonical][href]') value = value['href'] if value else None value = identifier_cleaner('http', quiet=True)(value) if value: @@ -241,7 +267,11 @@ def get_identifiers(self): def get_keywords(self): value = self.check_metas(['keywords']).strip(',;') if value: - self.set('subjects', re.split(' *[;,] *', value)) + subjects = [] + for subject in 
re.split(' *[;,] *', value): + if valid_subject(subject): + subjects.append(subject) + self.set('subjects', subjects) def get_publisher(self): value = self.check_metas(['citation_publisher', r'DC\.Source']) @@ -252,8 +282,8 @@ def get_pubdate(self): value = self.get_itemprop('datePublished', list_mode='one_item') if not value: value = self.check_metas([ - 'citation_publication_date', r'DC\.Date\.issued', 'datePublished', - 'books:release_date', 'book:release_date' + 'citation_publication_date', 'copyrightYear', r'DC\.Date\.issued', 'datePublished', + 'books:release_date', 'book:release_date', ]) if value: value = validate_date(value) @@ -304,7 +334,7 @@ def get_cover(self): self.set('covers', [{'image_url': image_url}]) def get_downloads(self): - for dl_type in ['epub', 'mobi', 'pdf']: + for dl_type in ['epub', 'pdf']: dl_meta = 'citation_{}_url'.format(dl_type) value = self.check_metas([dl_meta]) if value: diff --git a/core/loaders/smashwords.py b/core/loaders/smashwords.py index 3c4413ddb..452926126 100644 --- a/core/loaders/smashwords.py +++ b/core/loaders/smashwords.py @@ -1,6 +1,7 @@ import re -from urlparse import urljoin +from urllib.parse import urljoin from regluit.core.loaders.scrape import BaseScraper +from regluit.core.parameters import DOWNLOADABLE SWCAT = re.compile(r'^https://www\.smashwords\.com/books/category.*') class SmashwordsScraper(BaseScraper): @@ -22,7 +23,7 @@ def get_description(self): def get_downloads(self): dldiv = self.doc.select_one('#download') if dldiv: - for dl_type in ['epub', 'mobi', 'pdf']: + for dl_type in DOWNLOADABLE: dl_link = dldiv.find('a', href=re.compile(r'.*\.{}'.format(dl_type))) if dl_link: url = urljoin(self.base,dl_link['href']) diff --git a/core/loaders/soup.py b/core/loaders/soup.py new file mode 100644 index 000000000..bef93c24e --- /dev/null +++ b/core/loaders/soup.py @@ -0,0 +1,36 @@ +import logging + +from bs4 import BeautifulSoup +import requests + +from django.conf import settings + +logger = 
logging.getLogger(__name__) + +def get_soup(url, user_agent=settings.USER_AGENT, follow_redirects=False, verify=True): + try: + response = requests.get(url, headers={"User-Agent": user_agent}, + allow_redirects=follow_redirects, verify=verify, timeout=(10, 30)) + except requests.exceptions.MissingSchema: + response = requests.get('http://%s' % url, headers={"User-Agent": user_agent}, timeout=(10, 30)) + except requests.exceptions.ConnectionError as e: + logger.error("Connection refused for %s", url) + logger.error(e) + return None + except requests.exceptions.Timeout: + logger.error("Request timed out for %s", url) + return None + if response.status_code == 200: + soup = BeautifulSoup(response.content, 'lxml') + + # make sure document has a base + if not soup.find('base'): + obj = soup.find('head') + if obj: + obj.append(soup.new_tag("base", href=response.url)) + else: + logger.error('No head for %s', url) + return soup + else: + logger.error('%s returned code %s', url, response.status_code) + return None diff --git a/core/loaders/springer.py b/core/loaders/springer.py index c30e80362..cf6dc8c5c 100644 --- a/core/loaders/springer.py +++ b/core/loaders/springer.py @@ -1,5 +1,6 @@ import re -from urlparse import urljoin +import json +from urllib.parse import urljoin import requests from bs4 import BeautifulSoup @@ -8,6 +9,7 @@ from regluit.core.validation import identifier_cleaner from regluit.core.bookloader import add_from_bookdatas +from regluit.core.parameters import DOWNLOADABLE from .scrape import BaseScraper, CONTAINS_CC @@ -16,9 +18,19 @@ class SpringerScraper(BaseScraper): can_scrape_strings =['10.1007', '10.1057'] + + @classmethod + def get_response(cls, url): + try: + return requests.get(url, headers={"User-Agent": settings.CHROME_UA}) + except requests.exceptions.RequestException as e: + logger.error(e) + def get_downloads(self): - for dl_type in ['epub', 'mobi', 'pdf']: + for dl_type in DOWNLOADABLE: download_el = self.doc.find('a', 
title=re.compile(dl_type.upper())) + if not download_el: + download_el = self.doc.find('a', attrs={f'data-book-{dl_type}': True}) if download_el: value = download_el.get('href') if value: @@ -26,7 +38,9 @@ def get_downloads(self): self.set('download_url_{}'.format(dl_type), value) def get_description(self): - desc = self.doc.select_one('#book-description') + desc = self.doc.find('div', attrs={'data-component': 'data-unique-selling-points'}) + if not desc: + desc = self.doc.select_one('#book-description') if desc: value = '' for div in desc.contents: @@ -35,11 +49,16 @@ def get_description(self): text = text.replace(u'\xa0', u' ') value = u'{}

    {}

    '.format(value, text) self.set('description', value) + else: + super(SpringerScraper, self).get_description() def get_keywords(self): value = [] for kw in self.doc.select('.Keyword'): value.append(kw.text.strip()) + if len(value) == 0: + for kw in self.doc.select('#keywords-content li.c-article-subject-list__subject'): + value.append(kw.text.strip()) if value: if 'Open Access' in value: value.remove('Open Access') @@ -50,8 +69,10 @@ def get_identifiers(self): el = self.doc.select_one('#doi-url') if el: value = identifier_cleaner('doi', quiet=True)(el.text) - if value: - self.identifiers['doi'] = value + else: + value = identifier_cleaner('doi', quiet=True)(self.check_metas(['doi'])) + if value: + self.identifiers['doi'] = value def get_isbns(self): isbns = {} @@ -65,27 +86,52 @@ def get_isbns(self): value = identifier_cleaner('isbn', quiet=True)(el.text) if value: isbns['electronic'] = value + if len(isbns) > 0: + return isbns + data_json = self.doc.find('script', string=re.compile(r'window\.dataLayer =')) + if data_json: + data_json = data_json.text.strip()[18:] + data = json.loads(data_json.strip(';')) + content = data[0].get('content', None) + if content: + content = content.get('book', None) + if content: + value = identifier_cleaner('isbn', quiet=True)(content.get("pisbn", '')) + if value: + isbns['paper'] = value + value = identifier_cleaner('isbn', quiet=True)(content.get("eisbn", '')) + if value: + isbns['electronic'] = value return isbns + def get_title(self): el = self.doc.select_one('#book-title') value = '' if el: value = el.text.strip() - if value: - value = value.replace('\n', ': ', 1) - self.set('title', value) - if not value: + else: + el = self.doc.select_one('.page-title') + if el: + value = el.text.strip() + if value: + value = value.replace('\n', ': ', 1) + self.set('title', value) + else: super(SpringerScraper, self).get_title() def get_role(self): if self.doc.select_one('#editors'): return 'editor' + if self.doc.find('ul', 
attrs={'data-list-type':"editors"}): + return 'editor' return 'author' - def get_author_list(self): + def get_author_list(self): for el in self.doc.select('.authors__name'): yield el.text.strip().replace(u'\xa0', u' ') + for el in self.doc.select('.c-article-author-list__item'): + yield el.text.strip(', ').replace(u'\xa0', u' ') def get_license(self): '''only looks for cc licenses''' @@ -100,11 +146,16 @@ def get_license(self): self.set('rights_url', lic_url) def get_pubdate(self): - pubinfo = self.doc.select_one('#copyright-info') + pubinfo = self.doc.find(attrs={"data-test": "electronic_isbn_publication_date"}) + if not pubinfo: + pubinfo = self.doc.find(attrs={"data-test": "softcover_isbn_publication_date"}) if pubinfo: - yearmatch = HAS_YEAR.search(pubinfo.string) - if yearmatch: - self.set('publication_date', yearmatch.group(0)) + for yearstring in pubinfo.stripped_strings: + yearmatch = HAS_YEAR.search(yearstring) + if yearmatch: + self.set('publication_date', yearmatch.group(0)) + return + def get_publisher(self): self.set('publisher', 'Springer') @@ -116,7 +167,7 @@ def springer_open_books(startpage, endpage): for page in range(startpage, endpage + 1): url = search_url.format(page) try: - response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) + response = requests.get(url, headers={"User-Agent": settings.CHROME_UA}) if response.status_code == 200: base = response.url doc = BeautifulSoup(response.content, 'lxml') @@ -124,5 +175,5 @@ def springer_open_books(startpage, endpage): book_url = urljoin(base, link['href']) yield SpringerScraper(book_url) except requests.exceptions.ConnectionError: - print 'couldn\'t connect to %s' % url + print('couldn\'t connect to %s' % url) return add_from_bookdatas(springer_open_books(startpage, endpage)) diff --git a/core/loaders/tests.py b/core/loaders/tests.py index f94e1ad30..83ecd8174 100644 --- a/core/loaders/tests.py +++ b/core/loaders/tests.py @@ -1,7 +1,7 @@ from django.conf import settings from
django.test import TestCase from regluit.core.models import Ebook, Edition, Work -from .utils import dl_online +from .harvest import dl_online class LoaderTests(TestCase): def setUp(self): @@ -17,12 +17,13 @@ def test_downloads(self): edition = Edition(work=work) edition.save() - dropbox_url = 'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0' + dropbox_url = 'https://www.dropbox.com/s/azaztyvgf6b98bc/stellar-consensus-protocol.pdf?dl=0' dropbox_ebook = Ebook.objects.create(format='online', url=dropbox_url, edition=edition) dropbox_ebf, new_ebf = dl_online(dropbox_ebook) self.assertTrue(dropbox_ebf.ebook.filesize) - jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958' - jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition) + jbe_url = 'https://www.jbe-platform.com/content/books/9789027295958' + jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition, + provider='jbe-platform.com') jbe_ebf, new_ebf = dl_online(jbe_ebook) self.assertTrue(jbe_ebf.ebook.filesize) diff --git a/core/loaders/ubiquity.py b/core/loaders/ubiquity.py index c346cec44..899920d29 100644 --- a/core/loaders/ubiquity.py +++ b/core/loaders/ubiquity.py @@ -1,14 +1,33 @@ import re -from urlparse import urlparse +from urllib.parse import urlparse, urljoin -from regluit.utils.lang import get_language_code +from regluit.core.parameters import DOWNLOADABLE +from regluit.utils.lang import lang_to_language_code from . 
import BaseScraper HAS_EDS = re.compile(r'\(eds?\.\)') -UBIQUITY_HOSTS = ["ubiquitypress.com", "kriterium.se", "oa.finlit.fi", "humanities-map.net", - "oa.psupress.org", "larcommons.net", "uwestminsterpress.co.uk", "stockholmuniversitypress.se", +UBIQUITY_HOSTS = [ + "humanities-map.net", + "hup.fi", + "iitikship.iiti.ac.in", + "kriterium.se", + "larcommons.net", "luminosoa.org", + "oa.finlit.fi", + "oa.psupress.org", + "press.lse.ac.uk", + "press.sjms.nu", + "publishing.vt.edu", + "publikationer.uka.se", + "stockholmuniversitypress.se", + "ubiquitypress.com", + "universitypress.whiterose.ac.uk", + "utsepress.lib.uts.edu.au", + "uwestminsterpress.co.uk", + "www.cardiffuniversitypress.org", + "www.mwv-open.de", + "www.winchesteruniversitypress.org", ] class UbiquityScraper(BaseScraper): @@ -24,8 +43,15 @@ def get_language(self): langlabel = self.doc.find(string='Language') lang = langlabel.parent.parent.find_next_sibling() if langlabel else '' lang = lang.get_text() if lang else '' - lang = get_language_code(lang) if lang else '' + lang = lang_to_language_code(lang) if lang else '' if lang: self.set('language', lang) else: super(UbiquityScraper, self).get_language() + + def get_downloads(self): + for dl_type in DOWNLOADABLE: + dl_a = self.doc.find('a', attrs={'data-category': '{} download'.format(dl_type)}) + if dl_a and 'href' in dl_a.attrs: + url = urljoin(self.base, dl_a['href'].strip()) + self.set('download_url_{}'.format(dl_type), url) diff --git a/core/loaders/utils.py b/core/loaders/utils.py index f559870d1..41cc94673 100644 --- a/core/loaders/utils.py +++ b/core/loaders/utils.py @@ -3,28 +3,30 @@ import re import time import unicodedata -import urlparse +from urllib.parse import urlparse from bs4 import BeautifulSoup import requests from django.conf import settings -from django.core.files.base import ContentFile + from regluit.api.crosswalks import inv_relator_contrib from regluit.bisac.models import BisacHeading from regluit.core.bookloader import 
add_by_isbn_from_google, merge_works from regluit.core.isbn import ISBN from regluit.core.models import ( - Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work, + Ebook, Edition, Identifier, Subject, Work, ) +from .soup import get_soup + logger = logging.getLogger(__name__) def UnicodeDictReader(utf8_data, **kwargs): csv_reader = csv.DictReader(utf8_data, **kwargs) for row in csv_reader: - yield {key: unicode(value, 'utf-8') for key, value in row.iteritems()} + yield {key: str(value, 'utf-8') for key, value in row.iteritems()} def utf8_general_ci_norm(s): """ @@ -42,11 +44,6 @@ def utf8_general_ci_norm(s): s1 = unicodedata.normalize('NFD', s) return ''.join(c for c in s1 if not unicodedata.combining(c)).upper() -def get_soup(url): - response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) - if response.status_code == 200: - return BeautifulSoup(response.content, 'lxml') - return None def get_authors(book): authors = [] @@ -92,16 +89,9 @@ def get_subjects(book): bisac = BisacHeading.objects.get(notation=code) subjects.append(bisac) except BisacHeading.DoesNotExist: - logger.warning("Please add BISAC {}".format(code)) + logger.warning("Please add BISAC %s", code) return subjects -def add_subject(subject_name, work, authority=''): - try: - subject = Subject.objects.get(name=subject_name) - except Subject.DoesNotExist: - subject = Subject.objects.create(name=subject_name, authority=authority) - subject.works.add(work) - def get_title(book): title = book.get('FullTitle', '') #UMICH if title: @@ -132,7 +122,7 @@ def get_cover(book): if cover.status_code < 400: return cover_url else: - logger.warning("bad cover: {} for: {}".format(cover_url, url)) + logger.warning("bad cover: %s for: %s", cover_url, url) def get_isbns(book): isbns = [] @@ -199,7 +189,7 @@ def load_from_books(books): Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong, DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, 
ePublicationDate, eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights, - SubjectListMARC, , Book-level DOI, URL, License + SubjectListMARC, , Book-level DOI, URL, License ''' @@ -232,11 +222,19 @@ def load_from_books(books): if url: Identifier.set(type='http', value=url, edition=edition, work=work) + # get language + lang = get_language(book) + lang = lang if lang else 'en' + # make sure each isbn is represented by an Edition # also associate authors, publication date, cover, publisher for isbn in isbns: edition = add_by_isbn_from_google(isbn, work=work) if edition and edition.work != work: + work.language = lang + work.save() + edition.work.language = lang + edition.work.save() work = merge_works(work, edition.work) if not edition: edition = Edition(title=title, work=work) @@ -254,30 +252,28 @@ def load_from_books(books): # possibly replace work.description description = get_description(book) if len(description) > len(work.description): - work.description = description + work.description = description.replace('\r\n', '\n') work.save() # set language - lang = get_language(book) - if lang: - work.language = lang - work.save() + work.language = lang + work.save() # add a bisac subject (and ancestors) to work for bisacsh in get_subjects(book): while bisacsh: - add_subject(bisacsh.full_label, work, authority="bisacsh") + Subject.set_by_name(bisacsh.full_label, work, authority="bisacsh") bisacsh = bisacsh.parent - logging.info(u'loaded work {}'.format(work.title)) + logging.info(u'loaded work %s', work.title) loading_ok = loaded_book_ok(book, work, edition) results.append((book, work, edition)) try: - logger.info(u"{} {} {}\n".format(i, title, loading_ok)) + logger.info(u"%s %s %s\n", i, title, loading_ok) except Exception as e: - logger.info(u"{} {} {}\n".format(i, title, str(e))) + logger.info(u"%s %s %s\n", i, title, str(e)) return results @@ -294,7 +290,7 @@ def loaded_book_ok(book, work, edition): try: url_id = 
Identifier.objects.get(type='http', value=get_url(book)) if url_id is None: - logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book))) + logger.info("url_id problem: work.id %s, url: %s", work.id, get_url(book)) return False except Exception as e: logger.info(str(e)) @@ -302,8 +298,7 @@ def loaded_book_ok(book, work, edition): # isbns for isbn in isbns: - if Identifier.objects.filter(type='isbn', value=isbn).count() <> 1: - # print ("isbn problem: work.id {}, isbn: {}".format(work.id, isbn)) + if Identifier.objects.filter(type='isbn', value=isbn).count() != 1: return False else: try: @@ -312,9 +307,6 @@ def loaded_book_ok(book, work, edition): logger.info(e) return False - # authors - # print set([ed.name for ed in edition_for_isbn.authors.all()]) - if ( set([utf8_general_ci_norm(author[0]) for author in authors]) != set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()]) @@ -352,142 +344,23 @@ def loaded_book_ok(book, work, edition): ID_URLPATTERNS = { 'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P[a-zA-Z0-9\-_]{12})'), 'olwk': re.compile(r'[\./]openlibrary\.org(?P/works/OL\d{1,8}W)'), - 'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P\d{1,8})'), + 'doab': re.compile(r'([\./]directory\.doabooks\.org/handle/)(?P20\.500\.12854/\d{5,8})'), 'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P\d{1,8})'), 'ltwk': re.compile(r'[\./]librarything\.com/work/(?P\d{1,8})'), 'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P\d{8,12})'), 'doi': re.compile(r'[\./]doi\.org/(?P10\.\d+/\S+)'), 'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P\d{1,6})'), 'glue': re.compile(r'[\./]unglue\.it/work/(?P\d{1,7})'), + 'oapn': re.compile(r'[\./]oapen\.org/download\?.*docid=(?P\d{1,8})'), } def ids_from_urls(url): ids = {} - for ident in ID_URLPATTERNS.keys(): - id_match = ID_URLPATTERNS[ident].search(url) + if not url: + return ids + for ident, pattern in ID_URLPATTERNS.items(): + id_match = 
pattern.search(url) if id_match: ids[ident] = id_match.group('id') return ids -DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"') - -def dl_online(ebook): - if ebook.format != 'online': - pass - elif ebook.url.find(u'dropbox.com/s/') >= 0: - response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT}) - if response.status_code == 200: - match_dl = DROPBOX_DL.search(response.content) - if match_dl: - return make_dl_ebook(match_dl.group(1), ebook) - else: - logger.warning('couldn\'t get {}'.format(ebook.url)) - else: - logger.warning('couldn\'t get dl for {}'.format(ebook.url)) - - elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0: - doc = get_soup(ebook.url) - if doc: - obj = doc.select_one('div.fulltexticoncontainer-PDF a') - if obj: - dl_url = urlparse.urljoin(ebook.url, obj['href']) - return make_dl_ebook(dl_url, ebook) - else: - logger.warning('couldn\'t get dl_url for {}'.format(ebook.url)) - else: - logger.warning('couldn\'t get soup for {}'.format(ebook.url)) - - return None, False - -def make_dl_ebook(url, ebook): - if EbookFile.objects.filter(source=ebook.url): - return EbookFile.objects.filter(source=ebook.url)[0], False - response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) - if response.status_code == 200: - filesize = int(response.headers.get("Content-Length", 0)) - filesize = filesize if filesize else None - format = type_for_url(url, content_type=response.headers.get('content-type')) - if format != 'online': - new_ebf = EbookFile.objects.create( - edition=ebook.edition, - format=format, - source=ebook.url, - ) - new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content)) - new_ebf.save() - new_ebook = Ebook.objects.create( - edition=ebook.edition, - format=format, - provider='Unglue.it', - url=new_ebf.file.url, - rights=ebook.rights, - filesize=filesize, - version_label=ebook.version_label, - version_iter=ebook.version_iter, - ) - new_ebf.ebook = 
new_ebook - new_ebf.save() - return new_ebf, True - else: - logger.warning('download format for {} is not ebook'.format(url)) - else: - logger.warning('couldn\'t get {}'.format(url)) - return None, False - -def type_for_url(url, content_type=None): - if not url: - return '' - if url.find('books.openedition.org') >= 0: - return 'online' - if Ebook.objects.filter(url=url): - return Ebook.objects.filter(url=url)[0].format - ct = content_type if content_type else contenttyper.calc_type(url) - if re.search("pdf", ct): - return "pdf" - elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I): - return "pdf" - elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I): - return "epub" - elif re.search("text/plain", ct): - return "text" - elif re.search("text/html", ct): - if url.find('oapen.org/view') >= 0: - return "html" - return "online" - elif re.search("epub", ct): - return "epub" - elif re.search("mobi", ct): - return "mobi" - return "other" - -class ContentTyper(object): - """ """ - def __init__(self): - self.last_call = dict() - - def content_type(self, url): - try: - r = requests.head(url) - return r.headers.get('content-type', '') - except: - return '' - - def calc_type(self, url): - delay = 1 - # is there a delay associated with the url - netloc = urlparse.urlparse(url).netloc - - # wait if necessary - last_call = self.last_call.get(netloc) - if last_call is not None: - now = time.time() - min_time_next_call = last_call + delay - if min_time_next_call > now: - time.sleep(min_time_next_call-now) - - self.last_call[netloc] = time.time() - - # compute the content-type - return self.content_type(url) - -contenttyper = ContentTyper() diff --git a/core/lookups.py b/core/lookups.py index 06c50936d..05ce92e22 100644 --- a/core/lookups.py +++ b/core/lookups.py @@ -2,7 +2,9 @@ from selectable.registry import registry from django.contrib.auth.models import User +from django.db import models from django.db.models import Count + from 
regluit.core.models import Work, PublisherName, Edition, Subject, EditionNote, Ebook from regluit.utils.text import sanitize_line @@ -80,6 +82,17 @@ def create_item(self, value): new_note.save() return new_note +class Search(models.Lookup): + lookup_name = 'search' + + def as_mysql(self, compiler, connection): + lhs, lhs_params = self.process_lhs(compiler, connection) + rhs, rhs_params = self.process_rhs(compiler, connection) + params = lhs_params + rhs_params + return 'MATCH (%s) AGAINST (%s IN BOOLEAN MODE)' % (lhs, rhs), params + +models.TextField.register_lookup(Search) + registry.register(OwnerLookup) registry.register(WorkLookup) registry.register(PublisherNameLookup) @@ -87,3 +100,4 @@ def create_item(self, value): registry.register(SubjectLookup) registry.register(EditionNoteLookup) registry.register(EbookLookup) + diff --git a/core/management/commands/add_missing_doab_covers.py b/core/management/commands/add_missing_doab_covers.py deleted file mode 100644 index cd30149cd..000000000 --- a/core/management/commands/add_missing_doab_covers.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import print_function -from django.core.management.base import BaseCommand - -from regluit.core.models import Work -from regluit.core.loaders.doab import update_cover_doab - -class Command(BaseCommand): - help = "make covers for doab editions with bad covers" - - def handle(self, **options): - works = Work.objects.filter(identifiers__type='doab').distinct() - print('checking {} works with doab'.format(works.count())) - num = 0 - for work in works: - if not work.cover_image_thumbnail(): - update_cover_doab(work.doab, work.preferred_edition, store_cover=True) - #print(work.doab) - num += 1 - if num % 10 == 0: - print('{} doab covers updated'.format(num)) - #break - print('Done: {} doab covers updated'.format(num)) \ No newline at end of file diff --git a/core/management/commands/add_openlibrary.py b/core/management/commands/add_openlibrary.py index 99fc68b8a..313c98511 100644 
--- a/core/management/commands/add_openlibrary.py +++ b/core/management/commands/add_openlibrary.py @@ -6,5 +6,5 @@ class Command(BaseCommand): def handle(self, *args, **options): for work in models.Work.objects.filter(openlibrary_lookup__isnull=True): - print "loading openlibrary data for %s" % work + self.stdout.write("loading openlibrary data for %s" % work) bookloader.add_openlibrary(work) diff --git a/core/management/commands/bisac_to_bisacsh.py b/core/management/commands/bisac_to_bisacsh.py new file mode 100644 index 000000000..fd039df02 --- /dev/null +++ b/core/management/commands/bisac_to_bisacsh.py @@ -0,0 +1,26 @@ +import re +from django.core.management.base import BaseCommand +from regluit.bisac.models import BisacHeading +from regluit.core.models import Subject + + +bisac_pattern = re.compile(r'[A-Z]{3}\d+') + +class Command(BaseCommand): + help = "fix bisac headings" + + def handle(self, **options): + for subject in Subject.objects.filter(name__contains='bisac'): + print(subject.name) + match = bisac_pattern.search(subject.name) + bisac_code = match.group(0) if match else None + if bisac_code: + try: + bisac_heading = BisacHeading.objects.get(notation=bisac_code) + for work in subject.works.all(): + while bisac_heading: + Subject.set_by_name(bisac_heading.full_label, work, authority="bisacsh") + bisac_heading = bisac_heading.parent + subject.delete() + except BisacHeading.DoesNotExist: + self.stdout.write("no Bisac heading with notation %s" % bisac_code) diff --git a/core/management/commands/check_payment_integrity.py b/core/management/commands/check_payment_integrity.py deleted file mode 100644 index 706ff4079..000000000 --- a/core/management/commands/check_payment_integrity.py +++ /dev/null @@ -1,27 +0,0 @@ -from django.core.management.base import BaseCommand -from django.db.models import Q, F - -from regluit.payment.parameters import TRANSACTION_STATUS_ACTIVE -from regluit.core import models - -class Command(BaseCommand): - help = "Do some integrity 
from django.core.management.base import BaseCommand
from django.db.models import Q, F

from regluit.core import models


class Command(BaseCommand):
    help = "Do a few integrity checks on Works, Editions, and Identifiers"

    def handle(self, **options):
        """Report dangling or mismatched Work/Edition/Identifier rows.

        Read-only: every check just counts and prints; nothing is modified.
        """
        # hoist: the original evaluated this identical queryset three times
        no_ident = models.Work.objects.filter(identifiers__isnull=True)
        self.stdout.write("Number of Works without identifiers: {}".format(
            no_ident.count()))
        self.stdout.write("Last 20 Works without identifiers: ")
        for w in no_ident.order_by('-created')[0:20]:
            self.stdout.write("id: %d | title: %s | created: %s" % (w.id, w.title, w.created))
        self.stdout.write("Number of editions that are currently tied to Works w/o identifiers {}".format(
            models.Edition.objects.filter(work__identifiers__isnull=True).count()))
        self.stdout.write("Number of Identifiers not tied to Works (should be 0): {}".format(
            models.Identifier.objects.filter(work__isnull=True).count()))
        self.stdout.write("Number of Editions not tied to a Work (should be 0): {}".format(
            models.Edition.objects.filter(work__isnull=True).count()))
        self.stdout.write("Number of Ebooks not tied to an Edition (should be 0): {}".format(
            models.Ebook.objects.filter(edition__isnull=True).count()))
        # Work<->Edition can be tied two ways: the Edition.work foreign key,
        # and sharing the same Identifier. Check the two agree.
        self.stdout.write("Number of Works that have editions->identifiers that don't lead back to the same work (should be 0): {}".format(
            models.Work.objects.filter(~Q(editions__identifiers__work__id=F('id'))).count()))
        self.stdout.write("Number of Identifier pairs with an Edition in which Edition<->Work foreign key relationships does not tie the same Edition/Work (should be 0): {}".format(
            models.Identifier.objects.filter(edition__isnull=False).filter(~Q(edition__work__id=F('work__id'))).count()))
from django.core.management.base import BaseCommand

from regluit.core import models


class Command(BaseCommand):
    help = "claim books for rights_holder based on a text file of ISBNs"

    def add_arguments(self, parser):
        # scalar arguments: the original used nargs='+', which makes argparse
        # deliver LISTS, breaking int(rights_holder_id) and open(filename)
        parser.add_argument('rights_holder_id', type=int, help="rights_holder id")
        parser.add_argument('filename', help="filename")

    def handle(self, rights_holder_id, filename, **options):
        """Create an 'active' Claim for each ISBN in `filename` on behalf of
        the RightsHolder with id `rights_holder_id`. Already-claimed and
        unknown ISBNs are reported and skipped.
        """
        try:
            rh = models.RightsHolder.objects.get(id=rights_holder_id)
        # BUG FIX: the original caught models.Identifier.DoesNotExist here,
        # so a bad rights_holder_id crashed instead of printing the message
        except models.RightsHolder.DoesNotExist:
            self.stdout.write('{} not a rights_holder'.format(rights_holder_id))
            return
        with open(filename) as f:
            for isbn in f:
                isbn = isbn.strip()
                if not isbn:
                    continue
                try:
                    work = models.Identifier.objects.get(type='isbn', value=isbn).work
                except models.Identifier.DoesNotExist:
                    self.stdout.write('{} not loaded'.format(isbn))
                    continue
                try:
                    c = models.Claim.objects.get(work=work)
                    self.stdout.write('{} already claimed by {}'.format(work, c.rights_holder))
                except models.Claim.DoesNotExist:
                    c = models.Claim.objects.create(
                        work=work,
                        rights_holder=rh,
                        user=rh.owner,
                        status='active')
                    self.stdout.write('{} claimed for {}'.format(work, rh))
**options): problem_ebooks = calc_problem_ebooks() - print ("number of problem ebooks", len(problem_ebooks)) + self.stdout.write("number of problem ebooks", len(problem_ebooks)) # deactivate problem ebooks for (i, result) in enumerate(problem_ebooks): ebook = Ebook.objects.get(id=result['id']) - print ("\r", "deactivating ", i, ebook.id, end="") + self.stdout.write("\r", "deactivating ", i, ebook.id, end="") ebook.deactivate() # reload repos for (i, repo_name) in enumerate(set([repo_name_from_url(ebook['url']) for ebook in problem_ebooks])): - print ("reloading ", repo_name) + self.stdout.write("reloading ", repo_name) load_from_yaml(yaml_url(repo_name)) diff --git a/core/management/commands/clean_dangling_works.py b/core/management/commands/clean_dangling_works.py new file mode 100644 index 000000000..a7d9a3b4f --- /dev/null +++ b/core/management/commands/clean_dangling_works.py @@ -0,0 +1,22 @@ +from django.core.management.base import BaseCommand +from django.db.models import Count + +from regluit.core.models import Work, WasWork +from regluit.core.bookloader import merge_works + + + +class Command(BaseCommand): + '''remove works and editions without titles''' + help = "remove works and editions without titles" + + def handle(self, **options): + orphans = Work.objects.annotate(num_editions=Count('editions')).filter(num_editions=0) + for work in orphans: + self.stdout.write('cleaning %s' % work.title) + parent = None + for parent in WasWork.objects.filter(was=work.id): + # remerge into parent + merge_works(parent.work, work) + if not parent: + work.delete() diff --git a/core/management/commands/clean_db_strings.py b/core/management/commands/clean_db_strings.py index 2ce39ff60..4fefc6ddc 100644 --- a/core/management/commands/clean_db_strings.py +++ b/core/management/commands/clean_db_strings.py @@ -1,10 +1,8 @@ -from __future__ import print_function - from django.core.management.base import BaseCommand from django.db import IntegrityError from regluit.core import 
models -from regluit.utils.text import sanitize_line, remove_badxml +from regluit.utils.text import sanitize_line, remove_author_junk, remove_badxml class Command(BaseCommand): @@ -18,32 +16,40 @@ def handle(self, **options): work.title = sanitize_line(work.title) work.save() work_titles_fixed +=1 - if work.description and remove_badxml(work.description) != work.description: - work.description = remove_badxml(work.description) - work.save() - work_descriptions_fixed +=1 - print ("work_titles_fixed = {}".format(work_titles_fixed)) - print ("work_descriptions_fixed = {}".format(work_descriptions_fixed)) + if work.description: + save = False + if '\r\n' in work.description: + work.description = work.description.replace('\r\n', '\n') + save = True + if work.description and remove_badxml(work.description) != work.description: + work.description = remove_badxml(work.description) + save = True + if save: + work.save() + work_descriptions_fixed +=1 + self.stdout.write("work_titles_fixed = {}".format(work_titles_fixed)) + self.stdout.write("work_descriptions_fixed = {}".format(work_descriptions_fixed)) for edition in models.Edition.objects.all(): if sanitize_line(edition.title) != edition.title: edition.title = sanitize_line(edition.title) edition.save() edition_titles_fixed +=1 - print ("edition_titles_fixed = {}".format(edition_titles_fixed)) + self.stdout.write("edition_titles_fixed = {}".format(edition_titles_fixed)) for author in models.Author.objects.all(): - if sanitize_line(author.name) != author.name: - author.name = sanitize_line(author.name) - try: - author.save() - except IntegrityError as e: - # duplicate entry - correct = models.Author.objects.get(name=sanitize_line(author.name)) - for relator in author.relator_set.all(): - relator.author = correct - relator.save() - author.delete() - author_names_fixed +=1 - print ("author_names_fixed = {}".format(author_names_fixed)) + if remove_author_junk(sanitize_line(author.name)) != author.name: + author.name = 
from django.core.management.base import BaseCommand

from regluit.core.models import Work
from regluit.utils.lang import lang_to_language_code

# valid language values: 2- or 3-letter ISO 639 codes, or ll-CC locales.
# (Defined locally; the original also imported same-named constants from
# regluit.utils.lang and immediately shadowed them.)
iso639 = r'^[a-z][a-z][a-z]?$'
lang_and_locale = r'^[a-z][a-z]\-[A-Z][A-Z]$'


class Command(BaseCommand):
    '''normalize bad Work.language values to ISO 639 codes'''
    # BUG FIX: help/docstring were copy-pasted from another command
    # ("remove works and editions without titles") and described the
    # wrong behavior
    help = "normalize bad Work.language values to ISO 639 codes"

    def handle(self, **options):
        # anything that matches neither pattern is a bad language value
        badworks = Work.objects.exclude(language__regex=iso639)
        badworks = badworks.exclude(language__regex=lang_and_locale)
        self.stdout.write('{} works to fix'.format(badworks.count()))
        for work in badworks:
            language = lang_to_language_code(work.language)
            # 'xx' is the fallback for unrecognizable values
            work.language = language if language else 'xx'
            work.save()
from django.core.management.base import BaseCommand
from django.db.models import Count, Subquery, OuterRef, IntegerField

from regluit.core.loaders.doab import get_doab_record
from regluit.core.models import Work, Identifier


class Command(BaseCommand):
    help = "remove duplicate doab ids "

    def handle(self, **options):
        """For works carrying more than one doab identifier, delete the
        identifiers that no longer resolve to a DOAB record.
        """
        # per-work count of attached doab identifiers, as a subquery
        doab_counts = Subquery(
            Identifier.objects.filter(type='doab', work=OuterRef('pk'))
                .values('work')
                .annotate(cnt=Count('pk'))
                .values('cnt'),
            output_field=IntegerField(),
        )
        multi_doab = Work.objects.annotate(doab_count=doab_counts).filter(doab_count__gt=1)
        for work in multi_doab:
            for ident in work.identifiers.filter(type="doab"):
                if not get_doab_record(ident.value):
                    self.stdout.write('removing %s' % ident.value)
                    ident.delete()
def delete_newest_ebooks(ebooks):
    """
    given a list of ebooks (presumably with the same URL), delete all but the
    ebook that was created first
    """
    # BUG FIX: this is a module-level function, so the original's
    # self.stdout.write raised NameError; use print() here.
    ordered = sorted(ebooks, key=lambda ebook: ebook.created)
    for ebook in ordered[1:]:
        print("deleting ebook.id {}, edition.id {} work.id {}".format(
            ebook.id, ebook.edition_id, ebook.edition.work_id))
        ebook.delete()
    # BUG FIX: the survivor is the OLDEST ebook, not ebooks[0] -- the
    # original could report a just-deleted ebook as "undeleted"
    intact = ordered[0]
    print("leaving undeleted: ebook.id {}, edition.id {} work.id {}".format(
        intact.id, intact.edition_id, intact.edition.work_id))
from django.core.management.base import BaseCommand

from regluit.core.models import Work


class Command(BaseCommand):
    '''remove works and editions without titles'''
    help = "remove works and editions without titles"

    def handle(self, **options):
        """Delete every Work with an empty title, editions first."""
        untitled = Work.objects.filter(title='')
        for work in untitled:
            # NOTE(review): this assignment is never saved before the
            # deletes below -- confirm it is intentional
            work.selected_edition = None
            for edition in work.editions.all():
                edition.delete()
            work.delete()
print ("number of bad subjects:", len(bad_subjects)) + print("number of bad subjects: %s" % len(bad_subjects)) for bad_subject in bad_subjects: - print (bad_subject.name.encode('ascii', 'ignore'), bad_subject.works.count()) + self.stdout.write('{}, {}'.format( + bad_subject.name.encode('ascii', 'ignore'), + bad_subject.works.count() + )) bad_subject.delete() diff --git a/core/management/commands/despam_descriptions.py b/core/management/commands/despam_descriptions.py deleted file mode 100644 index d70c0a4e0..000000000 --- a/core/management/commands/despam_descriptions.py +++ /dev/null @@ -1,17 +0,0 @@ -from django.core.management.base import BaseCommand - -from regluit.core import models, bookloader - -class Command(BaseCommand): - help = "check description db for free ebook spam" - - def handle(self, **options): - spam_strings=["1stWorldLibrary.ORG", "GeneralBooksClub.com", "million-books.com", "AkashaPublishing.Com"] - for spam_string in spam_strings: - qs=models.Work.objects.filter(description__icontains=spam_string) - print "Number of Works with %s in description: %s" % (spam_string, qs.count()) - - for work in qs: - work.description = bookloader.despam_description(work.description) - print "updating work %s" % work - bookloader.add_openlibrary(work, hard_refresh = True) diff --git a/core/management/commands/dump_emails.py b/core/management/commands/dump_emails.py deleted file mode 100644 index a8975eb41..000000000 --- a/core/management/commands/dump_emails.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -print user emails -""" - -from django.core.management.base import BaseCommand -from django.contrib.auth.models import User - -from regluit.core import models - -class Command(BaseCommand): - help = "dump all ungluer emails" - - def handle(self, **options): - num=0 - - for user in User.objects.all(): - print user.email - num=num+1 - print "Number of emails= %s" % num diff --git a/core/management/commands/fail_campaign_amazon.py 
from django.core.management.base import BaseCommand

from regluit.core import models
from regluit.libraryauth.auth import pic_storage_url


class Command(BaseCommand):
    help = "fix avatar urls and settings"

    def handle(self, **options):
        """Migrate externally-hosted avatar pictures into our storage.

        Profiles whose pic_url already points at unglueit storage are
        left alone.
        """
        # (dropped the unused `string` and UNGLUEITAR imports)
        for profile in models.UserProfile.objects.exclude(pic_url=''):
            if not profile.pic_url.startswith('https://unglueit'):
                # BUG FIX: only log when we actually update; the original
                # printed "updating" for every profile, touched or not
                self.stdout.write("updating user %s" % profile.user)
                profile.pic_url = pic_storage_url(profile.user, 'twitter', profile.pic_url)
                profile.save()
+"20.500.12854/88675", +"20.500.12854/88677", +"20.500.12854/88678", +"20.500.12854/88679", +"20.500.12854/88680", +"20.500.12854/88681", +"20.500.12854/88683", +"20.500.12854/88686", +"20.500.12854/88687", +"20.500.12854/89178", +"20.500.12854/89252", +"20.500.12854/89255", +"20.500.12854/89257", +"20.500.12854/89260", +"20.500.12854/89265", +"20.500.12854/89441", +"20.500.12854/89490", +"20.500.12854/89496", +"20.500.12854/89498", +"20.500.12854/89514", +"20.500.12854/91350", +"20.500.12854/96212", +] +missing = [ +] + +class Command(BaseCommand): + """ To repair covers, will need a new refresh_cover method""" + help = "fix bad covers" + + def add_arguments(self, parser): + parser.add_argument('doab', nargs='?', default='', help="doab to fix") + + def handle(self, doab, **options): + if doab == 'mangled': + self.fix_mangled_covers() + elif doab == 'list': + for doab_id in to_fix: + self.fix_doab_cover(doab_id) + return True + elif doab == 'null': + no_cover_doab = Work.objects.filter(identifiers__type='doab').exclude(editions__cover_image__isnull=False) + for work in no_cover_doab: + cover_url = self.refresh_cover(work.doab) + if cover_url: + for e in work.editions.all(): + e.cover_image = cover_url + e.save() + self.stdout.write(f'added cover for {work.doab}') + else: + return self.fix_doab_cover(doab) + return False + + def fix_doab_cover(self, doab): + eds = Edition.objects.filter(cover_image__contains=doab) + + cover_url = self.refresh_cover(doab) + if cover_url: + for e in eds: + e.cover_image = cover_url + e.save() + if e.cover_image_small() and e.cover_image_thumbnail(): + self.stdout.write('fixed %s using %s' % (doab, cover_url)) + else: + self.stdout.write('bad thumbnails for %s' % cover_url) + return False + return True + self.stdout.write('removing bad cover for %s' % doab) + + for e in eds: + e.cover_image = None + e.save() + return False + + def fix_mangled_covers(self): + eds = Edition.objects.filter(cover_image__contains='amazonaws.comdoab') + for 
from django.core.management.base import BaseCommand
from django.db.models import Count

from regluit.core.models import Work, Ebook
from regluit.core.loaders.harvest import DOWNLOADABLE


class Command(BaseCommand):
    help = "fix inactive Ebooks"

    def handle(self, **options):
        """Re-activate the newest downloadable ebook for works that have
        ebook files but are flagged not-free; prints the not-free count
        before and after.
        """
        # BUG FIX: Count, not Sum -- Sum('editions__ebook_files') adds up
        # related primary keys, which only accidentally acts as an
        # existence test
        qs = Work.objects.annotate(
            num_free=Count('editions__ebook_files')).filter(num_free__gt=0)
        self.stdout.write(str(qs.filter(is_free=False).count()))
        for free in qs.filter(is_free=False):
            for ebook in Ebook.objects.filter(
                    edition__work_id=free.id,
                    format__in=DOWNLOADABLE).order_by('-created'):
                ebook.activate()
                break
        self.stdout.write(str(qs.filter(is_free=False).count()))
mdpi Ebooks" + + def handle(self, **options): + mdpi_match = re.compile(r'https://res.mdpi.com/bookfiles/book/(\d+)(.*)\?v=\d+') + + mdpi_ebs = Ebook.objects.filter(url__startswith='https://res.mdpi.com/bookfiles/book/', url__contains="?v=") + mdpi_ebfs = EbookFile.objects.filter(source__startswith='https://res.mdpi.com/bookfiles/book/', source__contains="?v=") + self.stdout.write('Ebooks %s, Ebook Files %s' % (mdpi_ebs.count(), mdpi_ebfs.count())) + + done = [] + for ebf in mdpi_ebfs.order_by('-created'): + match_ebf = mdpi_match.match(ebf.source) + if match_ebf: + bookno = match_ebf.group(1) + if bookno in done: + continue + else: + done.append(bookno) + stem = ebf.source.split('?')[0] + online_url = 'https://www.mdpi.com/books/pdfview/book/' + bookno + size = ebf.ebook.filesize + + # change the ebook provider to unglue.it + if ebf.ebook.provider != 'Unglue.it': + ebf.ebook.provider = 'Unglue.it' + ebf.ebook.url = ebf.file.url + ebf.ebook.active = True + ebf.ebook.save() + + # create the online ebook that should have been + online=Ebook.objects.get_or_create(format='online', url=online_url, edition=ebf.edition, + active=False, rights=ebf.ebook.rights, provider='MDPI Books') + + # reset ebf source + ebf.source = online_url + ebf.save() + + # check for duplicate ebfs + for old_ebook in mdpi_ebs.filter(url__contains='/' + bookno + '/').exclude(id=ebf.id).order_by('-created'): + old_ebook.active = False + for oldebf in old_ebook.ebook_files.exclude(id=ebf.id): + if oldebf.file != ebf.file: + # save storage by deleting redundant files + oldebf.file.delete() + oldebf.file = ebf.file + oldebf.source = ebf.source.split('?')[0] + oldebf.save() + old_ebook.save() + + # now make the rest of the ebooks onlines + done = [] + for eb in mdpi_ebs.filter(active=True): + match_eb = mdpi_match.match(eb.url) + if match_eb: + # make sure not already harvested + if eb.ebook_files.count(): + self.stdout.write('ebook %s already harvested' % eb.id) + continue + bookno = match_eb.group(1) 
+ eb.active = False + if bookno in done: + eb.active = False + eb.url = eb.url.split('?')[0] + eb.save() + + + diff --git a/core/management/commands/fix_ol_descriptions.py b/core/management/commands/fix_ol_descriptions.py index 4564736e1..0683c999d 100644 --- a/core/management/commands/fix_ol_descriptions.py +++ b/core/management/commands/fix_ol_descriptions.py @@ -6,8 +6,8 @@ class Command(BaseCommand): help = "do OL relookup if description contains { " def handle(self, **options): - print "Number of Works with { in description: %s" % models.Work.objects.filter(description__contains='{').count() + self.stdout.write("Number of Works with { in description: %s" % models.Work.objects.filter(description__contains='{').count()) for work in models.Work.objects.filter(description__contains='{'): - print "updating work %s" % work + self.stdout.write("updating work %s" % work) bookloader.add_openlibrary(work, hard_refresh = True) diff --git a/core/management/commands/fix_online_ebooks.py b/core/management/commands/fix_online_ebooks.py new file mode 100644 index 000000000..2f8a21031 --- /dev/null +++ b/core/management/commands/fix_online_ebooks.py @@ -0,0 +1,27 @@ +from django.core.management.base import BaseCommand + +from regluit.core.loaders.doab_utils import online_to_download +from regluit.core.models import Ebook + +class Command(BaseCommand): + help = "deactivate dead oapen ebooks" + args = "" + + def add_arguments(self, parser): + parser.add_argument('limit', nargs='?', type=int, default=0, help="max to fix") + + def handle(self, limit=0, **options): + limit = int(limit) if limit else 0 + onlines = Ebook.objects.filter(active=1, provider='OAPEN Library', + url__contains='/download/') + done = 0 + for online in onlines: + online.active = False + online.save() + done += 1 + #self.stdout.write(online.edition.work.title) + if done > limit: + break + self.stdout.write('fixed {} ebooks'.format(done)) + if done >= 1000: + self.stdout.write('1000 is the maximum; repeat to do 
from django.core.management.base import BaseCommand

from regluit.core.models import EbookFile, Ebook
from regluit.core.loaders.soup import get_soup


class Command(BaseCommand):
    help = "clean orphaned EbookFiles and backfill missing Ebook filesizes"

    def handle(self, **options):
        """Delete EbookFiles with no Ebook, then fill in zero/missing
        Ebook.filesize values from the stored file, best-effort.
        """
        # EbookFiles with a source but no ebook are orphans
        for ebf in EbookFile.objects.filter(ebook__isnull=True, source__isnull=False):
            ebf.delete()
        # the original repeated this loop body verbatim for the two querysets
        for queryset in (EbookFile.objects.filter(ebook__filesize=0),
                         EbookFile.objects.filter(ebook__filesize__isnull=True)):
            for ebf in queryset:
                try:
                    ebf.ebook.filesize = ebf.file.size
                    ebf.ebook.save()
                except Exception:
                    # best-effort: the file may be missing from storage.
                    # (narrowed from a bare `except:`, which would also
                    # swallow KeyboardInterrupt/SystemExit)
                    continue
django.core.management.base import BaseCommand - -from regluit.core.goodreads import GoodreadsClient - -class Command(BaseCommand): - help = "list books on given user bookshelf" - args = "" - - def handle(self, user_id, shelf_name, max_books, **options): - max_books = int(max_books) - gc = GoodreadsClient(key=settings.GOODREADS_API_KEY, secret=settings.GOODREADS_API_SECRET) - for (i, review) in enumerate(islice(gc.review_list(user_id,shelf=shelf_name),max_books)): - print i, review["book"]["title"], review["book"]["isbn10"], review["book"]["small_image_url"] \ No newline at end of file diff --git a/core/management/commands/goodreads_load_books.py b/core/management/commands/goodreads_load_books.py deleted file mode 100644 index 0a61f1c0b..000000000 --- a/core/management/commands/goodreads_load_books.py +++ /dev/null @@ -1,19 +0,0 @@ -from django.conf import settings -from django.contrib.auth.models import User -from django.core.management.base import BaseCommand - -from regluit.core import tasks, bookloader -from regluit.core.goodreads import GoodreadsClient - -#from regluit.core.goodreads import load_shelf_into_wishlist - -class Command(BaseCommand): - help = "list books on given user bookshelf" - args = "" - - def handle(self, user_name, goodreads_user_id, shelf_name, max_books, **options): - - user = User.objects.get(username=user_name) - max_books = int(max_books) - - tasks.load_goodreads_shelf_into_wishlist.delay(user.id, shelf_name, goodreads_user_id, max_books) \ No newline at end of file diff --git a/core/management/commands/harvest_manual_ebooks.py b/core/management/commands/harvest_manual_ebooks.py new file mode 100644 index 000000000..a5a7de89b --- /dev/null +++ b/core/management/commands/harvest_manual_ebooks.py @@ -0,0 +1,30 @@ +from random import shuffle +from django.core.management.base import BaseCommand + +from regluit.core.loaders.harvest import harvest_manual +from regluit.core.models import Ebook + +class Command(BaseCommand): + help = "load 
manually harvested ebooks" + + def add_arguments(self, parser): + parser.add_argument('--ebook', nargs='?', type=int, default=0, help="ebook to harvest") + parser.add_argument('--provider', nargs='?', default='', help="provider to harvest") + + def handle(self, limit=0, trace=False, **options): + if options.get('ebook'): + onlines = Ebook.objects.filter(id=options.get('ebook')) + elif options.get('provider'): + onlines = Ebook.objects.filter(provider=options.get('provider')) + self.stdout.write('%s onlines to check' % onlines.count()) + done = 0 + providers = {} + + for online in onlines: + new_ebf, new = harvest_manual(online) + if new_ebf and new: + done += new + providers[online.provider] = providers.get(online.provider, 0) + 1 + self.stdout.write(new_ebf.edition.work.title) + self.stdout.write('harvested {} ebooks'.format(done)) + self.stdout.write(str(providers)) diff --git a/core/management/commands/harvest_online_ebooks.py b/core/management/commands/harvest_online_ebooks.py index 06aeeab91..85fcd71a9 100644 --- a/core/management/commands/harvest_online_ebooks.py +++ b/core/management/commands/harvest_online_ebooks.py @@ -1,21 +1,52 @@ +from random import shuffle from django.core.management.base import BaseCommand -from regluit.core.loaders.utils import dl_online +from regluit.core.loaders.harvest import dl_online, rl, CMPPROVIDERS from regluit.core.models import Ebook class Command(BaseCommand): help = "harvest downloadable ebooks from 'online' ebooks" args = "" - - def handle(self, limit=0, **options): + + def add_arguments(self, parser): + parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest") + parser.add_argument('--ebook', nargs='?', type=int, default=0, help="ebook to harvest") + parser.add_argument('--provider', nargs='?', default='', help="provider to harvest") + parser.add_argument('--format', nargs='?', default='online', help="format to harvest") + parser.add_argument('--trace', action='store_true', help="trace") + + 
def handle(self, limit=0, trace=False, **options): limit = int(limit) if limit else 0 - onlines = Ebook.objects.filter(format='online') + #rl = RateLimiter() + format = options.get('format') + if options.get('ebook'): + onlines = Ebook.objects.filter(id=options.get('ebook')) + elif options.get('provider'): + provider = options.get('provider') + if provider == 'CMPPROVIDERS': + onlines = Ebook.objects.filter(provider__in=CMPPROVIDERS) + else: + onlines = Ebook.objects.filter(provider=provider, format=format) + self.stdout.write('%s onlines to check' % onlines.count()) + else: + online_ids = [ebook.id for ebook in Ebook.objects.filter(format=format)] + self.stdout.write('%s onlines to check' % len(online_ids)) + shuffle(online_ids) + onlines = (Ebook.objects.get(id=id) for id in online_ids) done = 0 + providers = {} + for online in onlines: - new_ebf, new = dl_online(online) + if trace: + self.stdout.write(str(online.id)) + new_ebf, new = dl_online(online, limiter=rl.delay, format=format) if new_ebf and new: - done += 1 - if done > limit: + done += new + providers[online.provider] = providers.get(online.provider, 0) + 1 + self.stdout.write(new_ebf.edition.work.title) + if done >= limit or done >= 500: break - print 'harvested {} ebooks'.format(done) - + self.stdout.write('harvested {} ebooks'.format(done)) + self.stdout.write(str(providers)) + if done >= 500: + self.stdout.write('500 is the maximum; repeat to do more') diff --git a/core/management/commands/harvest_remote_ebooks.py b/core/management/commands/harvest_remote_ebooks.py new file mode 100644 index 000000000..afa1e9209 --- /dev/null +++ b/core/management/commands/harvest_remote_ebooks.py @@ -0,0 +1,63 @@ +from random import shuffle +from django.core.management.base import BaseCommand + +from regluit.core.loaders.harvest import archive_dl, RateLimiter, DONT_HARVEST +from regluit.core.models import Ebook +from regluit.core.parameters import GOOD_PROVIDERS +DOWNLOADABLE = ['pdf', 'epub'] + +DONT_CHECK = 
list(GOOD_PROVIDERS) + DONT_HARVEST + +class Command(BaseCommand): + help = "check/harvest ebooks from 'remote' ebooks" + args = "" + + def add_arguments(self, parser): + parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest") + parser.add_argument('--ebook', nargs='?', type=int, default=0, help="ebook to harvest") + parser.add_argument('--provider', nargs='?', default='', help="provider to harvest") + parser.add_argument('--format', nargs='?', default='all', help="format to harvest") + parser.add_argument('--trace', action='store_true', help="trace") + + def handle(self, limit=0, trace=False, **options): + limit = int(limit) if limit else 0 + rl = RateLimiter() + format = options.get('format') + if format == 'all': + onlines = Ebook.objects.filter(format__in=DOWNLOADABLE) + else: + onlines = Ebook.objects.filter(format=format) + if options.get('ebook'): + onlines = Ebook.objects.filter(id=options.get('ebook')) + elif options.get('provider'): + onlines = onlines.filter(provider=options.get('provider')) + else: + onlines = onlines.exclude(provider__in=DONT_CHECK) + online_ids = [ebook.id for ebook in onlines] + self.stdout.write('%s ebooks need checking.' 
% len(onlines)) + shuffle(online_ids) + onlines = (Ebook.objects.get(id=id) for id in online_ids) + archived = {} + failed = {} + done = 0 + for online in onlines: + if trace: + self.stdout.write(str(online.id)) + status = archive_dl(online, limiter=rl.delay) + if status == 1: + done += 1 + archived[online.provider] = archived.get(online.provider, 0) + 1 + self.stdout.write(online.edition.title) + elif status == -1: + done += 1 + failed[online.provider] = failed.get(online.provider, 0) + 1 + online.format = 'online' + online.active = False + online.save() + if done >= limit or done >= 2000: + break + self.stdout.write("archived") + for result in [archived, failed]: + for provider in result: + self.stdout.write('%s\t%s' % (provider, result[provider])) + self.stdout.write("failed") diff --git a/core/management/commands/initialize_pledge_badges.py b/core/management/commands/initialize_pledge_badges.py deleted file mode 100644 index a1849e84b..000000000 --- a/core/management/commands/initialize_pledge_badges.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -set the 'pledged' badge for people who've pledged -""" - -from django.core.management.base import BaseCommand -from django.contrib.auth.models import User - -from regluit.core.models import Badge -from regluit.payment.models import Transaction - -class Command(BaseCommand): - help = "for people who've pledged, give them a badge!" 
- - - def handle(self, **options): - pledger= Badge.objects.get(name='pledger') - pledger2= Badge.objects.get(name='pledger2') - print 'start' - print 'pledger badges: %s' % pledger.holders.all().count() - print 'pledger2 badges: %s' % pledger2.holders.all().count() - pledges=Transaction.objects.exclude(status='NONE').exclude(status='Canceled',reason=None).exclude(anonymous=True) - for pledge in pledges: - if pledge.user.profile.badges.all().count(): - if pledge.user.profile.badges.all()[0].id == pledger.id: - pledge.user.profile.badges.remove(pledger) - pledge.user.profile.badges.add(pledger2) - else: - pledge.user.profile.badges.add(pledger) - print 'end' - print 'pledger badges: %s' % pledger.holders.all().count() - print 'pledger2 badges: %s' % pledger2.holders.all().count() - - - - diff --git a/core/management/commands/librarything_load_books.py b/core/management/commands/librarything_load_books.py deleted file mode 100644 index b43e72439..000000000 --- a/core/management/commands/librarything_load_books.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.conf import settings -from django.contrib.auth.models import User -from django.core.management.base import BaseCommand - -from regluit.core import librarything, tasks - -class Command(BaseCommand): - help = "load Librarything books into wishlist" - args = "" - - def handle(self, user_name, lt_username, max_books, **options): - - user = User.objects.get(username=user_name) - max_books = int(max_books) - - tasks.load_librarything_into_wishlist.delay(user.id, lt_username, max_books) \ No newline at end of file diff --git a/core/management/commands/librarything_load_books_2.py b/core/management/commands/librarything_load_books_2.py deleted file mode 100644 index e8cffc802..000000000 --- a/core/management/commands/librarything_load_books_2.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.conf import settings -from django.contrib.auth.models import User -from django.core.management.base import BaseCommand - -from 
regluit.core import librarything, tasks - -class Command(BaseCommand): - help = "load Librarything books into wishlist" - args = "" - - def handle(self, lt_username, **options): - - lt = librarything.LibraryThing(username=lt_username) - for (i, book) in enumerate(lt.parse_user_catalog(view_style=5)): - print i, book["title"], book["isbn"], book["work_id"], book["book_id"] - \ No newline at end of file diff --git a/core/management/commands/list_editions.py b/core/management/commands/list_editions.py deleted file mode 100644 index ff1f93794..000000000 --- a/core/management/commands/list_editions.py +++ /dev/null @@ -1,11 +0,0 @@ -from django.core.management.base import BaseCommand - -from regluit.core import models - -class Command(BaseCommand): - help = "list all editions in the database" - - def handle(self, *args, **options): - editions = models.Edition.objects.all() - for edition in editions: - print edition.id, edition.title, edition.isbn_10, edition.isbn_13 diff --git a/core/management/commands/list_queued_notices.py b/core/management/commands/list_queued_notices.py deleted file mode 100644 index 5f40bd0af..000000000 --- a/core/management/commands/list_queued_notices.py +++ /dev/null @@ -1,11 +0,0 @@ -import pickle - -import notification -from django.core.management.base import BaseCommand - -class Command(BaseCommand): - help = "Displays currently queues notices from django-notification" - - def handle(self, **options): - for (i, queued_batch) in enumerate(notification.models.NoticeQueueBatch.objects.all()): - print i, queued_batch.id, pickle.loads(str(queued_batch.pickled_data).decode("base64")) diff --git a/core/management/commands/load_books.py b/core/management/commands/load_books.py index 57faa7d94..b530ec1fe 100644 --- a/core/management/commands/load_books.py +++ b/core/management/commands/load_books.py @@ -4,13 +4,14 @@ class Command(BaseCommand): help = "load books based on a text file of ISBNs" - args = "" + def add_arguments(self, parser): + 
parser.add_argument('filename', nargs='+', help="filename") def handle(self, filename, **options): for isbn in open(filename): isbn = isbn.strip() edition = bookloader.add_by_isbn(isbn) if edition: - print "loaded %s as %s" % (isbn, edition) + self.stdout.write("loaded %s as %s" % (isbn, edition)) else: - print "failed to load book for %s" % isbn + self.stdout.write("failed to load book for %s" % isbn) diff --git a/core/management/commands/load_books_from_onix_csv.py b/core/management/commands/load_books_from_onix_csv.py index 66610f544..05538bcf0 100644 --- a/core/management/commands/load_books_from_onix_csv.py +++ b/core/management/commands/load_books_from_onix_csv.py @@ -5,9 +5,10 @@ class Command(BaseCommand): help = "load books based on a csv spreadsheet of onix data" - args = "" + def add_arguments(self, parser): + parser.add_argument('filename', nargs='+', help="filename") def handle(self, filename, **options): sheetreader= UnicodeDictReader(open(filename,'rU'), dialect=csv.excel) load_from_books(sheetreader) - print "finished loading" + self.stdout.write("finished loading") diff --git a/core/management/commands/load_books_from_onix_tsv.py b/core/management/commands/load_books_from_onix_tsv.py index dfa82aa2d..d6eb2f0b7 100644 --- a/core/management/commands/load_books_from_onix_tsv.py +++ b/core/management/commands/load_books_from_onix_tsv.py @@ -5,9 +5,10 @@ class Command(BaseCommand): help = "load books based on a csv spreadsheet of onix data" - args = "" + def add_arguments(self, parser): + parser.add_argument('filename', nargs='+', help="filename") def handle(self, filename, **options): sheetreader= UnicodeDictReader(open(filename,'rU'), dialect=csv.excel_tab) load_from_books(sheetreader) - print "finished loading" + self.stdout.write("finished loading") diff --git a/core/management/commands/load_books_from_sitemap.py b/core/management/commands/load_books_from_sitemap.py index dcc886ad3..5668472c8 100644 ---
a/core/management/commands/load_books_from_sitemap.py +++ b/core/management/commands/load_books_from_sitemap.py @@ -35,6 +35,19 @@ def handle(self, url, max=None, **options): if max and max < 0: break else: - books = add_by_sitemap(url, maxnum=max) + books = add_by_sitemap(url, maxnum=max) + + for edition in books: + done_fmt = set() + for ebook in edition.work.ebooks_all(): + for fmt in ['pdf', 'epub', 'mobi']: + if ebook.format == fmt: + if fmt not in done_fmt: + ebook.activate() + done_fmt.add(fmt) + else: + ebook.deactivate() + + - print "loaded {} books".format(len(books)) + self.stdout.write("loaded {} books".format(len(books))) diff --git a/core/management/commands/load_books_ku.py b/core/management/commands/load_books_ku.py new file mode 100644 index 000000000..58599be83 --- /dev/null +++ b/core/management/commands/load_books_ku.py @@ -0,0 +1,17 @@ +from django.core.management.base import BaseCommand + +from regluit.core.loaders.ku import load_ku, activate_ku_ebooks + +class Command(BaseCommand): + help = "load books from knowledge unlatched" + + def add_arguments(self, parser): + parser.add_argument('round', nargs='?', type=int, default=None, help="round to load") + + + def handle(self, round, **options): + books = load_ku(round) + self.stdout.write("loaded {} books".format(len(books))) + activated = activate_ku_ebooks() + self.stdout.write("activated {} ebooks".format(activated)) + diff --git a/core/management/commands/load_books_pbdata.py b/core/management/commands/load_books_pbdata.py new file mode 100644 index 000000000..4b5fa3b90 --- /dev/null +++ b/core/management/commands/load_books_pbdata.py @@ -0,0 +1,37 @@ +import json +from datetime import datetime + +from django.core.management.base import BaseCommand + +from regluit.core.loaders import add_by_metadata +from regluit.core.loaders.pressbooks import PressbooksScraper + +class Command(BaseCommand): + help = "load books from a json file from pressbooks" + def add_arguments(self, parser): + 
parser.add_argument('filename', help="filename") + parser.add_argument( + '--from', + action='store', + dest='from_date', + default='1-1-2000', + help='only read records after ', + ) + + def handle(self, filename, **options): + with open(filename, 'r') as jsonfile: + pb_metadata = json.load(jsonfile) + self.stdout.write(f'reading {len(pb_metadata)} records') + try: + from_date = datetime.strptime(options['from_date'], '%m-%d-%Y') + except ValueError: + from_date = datetime.strptime('1-1-2000', '%m-%d-%Y') + for record in pb_metadata: + if 'updated' in record: + updated = datetime.strptime(record['updated'], '%m-%d-%Y') + if updated < from_date: + continue + scraper = PressbooksScraper(record['url'], initial=record) + add_by_metadata(scraper.metadata) + + self.stdout.write("finished loading") diff --git a/core/management/commands/load_books_routledge.py b/core/management/commands/load_books_routledge.py new file mode 100644 index 000000000..f05aaa144 --- /dev/null +++ b/core/management/commands/load_books_routledge.py @@ -0,0 +1,11 @@ +from django.core.management.base import BaseCommand + +from regluit.core.loaders.routledge import load_routledge + +class Command(BaseCommand): + help = "load books from routledge" + + def handle(self, **options): + books = load_routledge() + self.stdout.write("loaded {} books".format(len(books))) + diff --git a/core/management/commands/load_books_springer.py b/core/management/commands/load_books_springer.py index a7dba9d81..93980e9a5 100644 --- a/core/management/commands/load_books_springer.py +++ b/core/management/commands/load_books_springer.py @@ -1,12 +1,32 @@ from django.core.management.base import BaseCommand -from regluit.core.loaders.springer import load_springer +from regluit.core.loaders.springer import load_springer, SpringerScraper +from regluit.core.bookloader import add_from_bookdatas class Command(BaseCommand): help = "load books from springer open" - args = " " + def add_arguments(self, parser): + 
parser.add_argument('startpage', nargs='?', type=int, default=1, help="page to start on") + parser.add_argument('endpage', nargs='?', type=int, default=1, help="page to end on") + parser.add_argument('--url', nargs='?', default='', help="url to scrape") + def handle(self, startpage, endpage=0, **options): - books = load_springer(int(startpage), int(endpage)) - print "loaded {} books".format(len(books)) + if options.get('url'): + books = add_from_bookdatas([SpringerScraper(options.get('url'))]) + else: + books = load_springer(int(startpage), int(endpage)) + self.stdout.write("loaded {} books".format(len(books))) + + for edition in books: + done_fmt = set() + for ebook in edition.work.ebooks_all(): + for fmt in ['pdf', 'epub', 'mobi']: + if ebook.format == fmt: + if fmt not in done_fmt: + ebook.activate() + done_fmt.add(fmt) + else: + ebook.deactivate() + diff --git a/core/management/commands/load_by_doab.py b/core/management/commands/load_by_doab.py index beb324838..0df98aca2 100644 --- a/core/management/commands/load_by_doab.py +++ b/core/management/commands/load_by_doab.py @@ -6,7 +6,7 @@ class Command(BaseCommand): help = "load doab books by doab_id via oai" def add_arguments(self, parser): - parser.add_argument('doab_ids', nargs='+', type=int, default=1, help="doab ids to add") + parser.add_argument('doab_ids', nargs='+', default=1, help="doab ids to add") def handle(self, doab_ids, **options): for doab_id in doab_ids: diff --git a/core/management/commands/load_doab.py b/core/management/commands/load_doab.py index 10856e65e..214c76c95 100644 --- a/core/management/commands/load_doab.py +++ b/core/management/commands/load_doab.py @@ -1,18 +1,29 @@ +import datetime from django.core.management.base import BaseCommand from regluit.core.loaders import doab +def timefromiso(datestring): + try: + return datetime.datetime.strptime(datestring, "%Y-%m-%d") + except: + return datetime.datetime.strptime(datestring, "%Y-%m-%dT%H:%M:%S") + class Command(BaseCommand): help = 
"load doab books via oai" - args = " " - - def handle(self, from_year= None, limit=None, **options): - from_year = int(from_year) if from_year else None - limit = int(limit) if limit else None - if limit: - doab.load_doab_oai(from_year=from_year, limit=limit) - else: - if from_year: - doab.load_doab_oai(from_year=from_year) - else: - doab.load_doab_oai() + + def add_arguments(self, parser): + parser.add_argument('from_date', nargs='?', type=timefromiso, + default=None, help="YYYY-MM-DD to start") + parser.add_argument('--until', nargs='?', type=timefromiso, + default=None, help="YYYY-MM-DD to end") + parser.add_argument('--max', nargs='?', type=int, default=None, help="max desired records") + + def handle(self, from_date, **options): + until_date = options['until'] + max = options['max'] + self.stdout.write('starting at date:{} until:{}, max: {}'.format( + from_date, until_date, max)) + records, new_doabs, last_time = doab.load_doab_oai(from_date, until_date, limit=max) + self.stdout.write('loaded {} records ({} new), ending at {}'.format( + records, new_doabs, last_time)) diff --git a/core/management/commands/load_edp.py b/core/management/commands/load_edp.py new file mode 100644 index 000000000..55961052c --- /dev/null +++ b/core/management/commands/load_edp.py @@ -0,0 +1,10 @@ +from django.core.management.base import BaseCommand + +from regluit.core.loaders.multiscrape import edp_scrape + + +class Command(BaseCommand): + help = "load books from edp-open" + + def handle(self, **options): + edp_scrape() diff --git a/core/management/commands/load_gutenberg.py b/core/management/commands/load_gutenberg.py deleted file mode 100644 index e99891f82..000000000 --- a/core/management/commands/load_gutenberg.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Load the Gutenberg editions - -""" - -from django.core.management.base import BaseCommand - -from regluit.core import models -from regluit.test import booktests - -class Command(BaseCommand): - help = "load Gutenberg editions" - 
args = "" - - def handle(self, max_num, **options): - - try: - max_num = int(max_num) - except: - max_num = None - - print "number of Gutenberg editions (before)", \ - models.Edition.objects.filter(identifiers__type='gtbg').count() - print "number of Gutenberg ebooks (before)", \ - models.Ebook.objects.filter(edition__identifiers__type='gtbg').count() - - booktests.load_gutenberg_books(max_num=max_num) - - print "number of Gutenberg editions (after)", \ - models.Edition.objects.filter(identifiers__type='gtbg').count() - print "number of Gutenberg ebooks (after)", \ - models.Ebook.objects.filter(edition__identifiers__type='gtbg').count() - diff --git a/core/management/commands/load_wishlist.py b/core/management/commands/load_wishlist.py deleted file mode 100644 index 14f452ca7..000000000 --- a/core/management/commands/load_wishlist.py +++ /dev/null @@ -1,21 +0,0 @@ -from django.core.management.base import BaseCommand -from django.contrib.auth.models import User - -from regluit.core import bookloader - -class Command(BaseCommand): - help = "populate a user's wishlist with books from a file of isbns" - args = " " - - def handle(self, filename, username, **options): - user = User.objects.get(username=username) - wishlist = user.wishlist - for isbn in open(filename): - isbn = isbn.strip() - edition = bookloader.add_by_isbn(isbn) - if edition: - bookloader.add_related(isbn) - user.wishlist.add_work(edition.work, source="user") - print "loaded %s as %s for %s" % (isbn, edition, user) - else: - print "failed to load book for %s" % isbn diff --git a/core/management/commands/make_missing_mobis.py b/core/management/commands/make_missing_mobis.py deleted file mode 100644 index 562aa9ccb..000000000 --- a/core/management/commands/make_missing_mobis.py +++ /dev/null @@ -1,44 +0,0 @@ -from django.core.management.base import BaseCommand -from regluit.core.models import Work, EbookFile - - -class Command(BaseCommand): - help = "generate mobi ebooks where needed and possible." 
- - def add_arguments(self, parser): - parser.add_argument('max', nargs='?', type=int, default=1, help="maximum mobis to make") - parser.add_argument('--reset', '-r', action='store_true', help="reset failed mobi conversions") - - - def handle(self, max=None, **options): - maxbad = 10 - if options['reset']: - bads = EbookFile.objects.filter(mobied__lt=0) - for bad in bads: - bad.mobied = 0 - bad.save() - - epubs = Work.objects.filter(editions__ebooks__format='epub').distinct().order_by('-id') - - i = 0 - n_bad = 0 - for work in epubs: - if not work.ebooks().filter(format="mobi"): - for ebook in work.ebooks().filter(format="epub"): - ebf = ebook.get_archive_ebf() - if ebf and ebf.mobied >= 0: - try: - print u'making mobi for {}'.format(work.title) - if ebf.make_mobi(): - print 'made mobi' - i += 1 - break - else: - self.stdout.write('failed to make mobi') - n_bad += 1 - - except: - self.stdout.write('failed to make mobi') - n_bad += 1 - if i >= max or n_bad >= maxbad: - break diff --git a/core/management/commands/no_isbn_works.py b/core/management/commands/no_isbn_works.py deleted file mode 100644 index 00921b500..000000000 --- a/core/management/commands/no_isbn_works.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -list works with no isbn -""" - -from django.core.management.base import BaseCommand -from django.db.models import Count - -from regluit.core import models - -class Command(BaseCommand): - help = "list works with no isbn. 
actions: count, list, wished" - args = "" - - - def handle(self, action='count', **options): - no_isbn_works=models.Work.objects.exclude(identifiers__type='isbn') - num=no_isbn_works.count() - print "%s works without isbn:"% num - if action=='list': - for work in no_isbn_works: - print "%s, %s"% (work.id, work.title) - elif action=='wished': - print "%s wished works without isbn:"% no_isbn_works.filter(num_wishes__gt=0).count() - for work in no_isbn_works.filter(num_wishes__gt=0): - print "%s, %s, %s"% (work.id, work.title, work.num_wishes) - - diff --git a/core/management/commands/old_campaign_stats.py b/core/management/commands/old_campaign_stats.py deleted file mode 100644 index 62ffa4db4..000000000 --- a/core/management/commands/old_campaign_stats.py +++ /dev/null @@ -1,59 +0,0 @@ -from django.core.management.base import BaseCommand -from django.db.models import Q, F, Count, Sum - -from regluit.core.models import Campaign - -STATS_TEMPLATE = """Total Pledged: {0} by {1} Pledgers - -Distribution of Pledges: - -{2} - -Premiums Offered: - -{3} - -Premiums Selected: - -{4} - -Number of Transactions without premiums selected: {5}""" - -def campaign_stats(c): - # Use aggregations: https://docs.djangoproject.com/en/dev/topics/db/aggregation/#cheat-sheet - - transactions = c.transaction_set.filter(Q(status='Canceled') & Q(reason ='Amazon FPS shutdown')) - - amount_sum = transactions.aggregate(Sum('amount'))['amount__sum'] - number_pledgers = transactions.count() - - # do we have unique - - amount_table = "Level\tCount\tTotal\n" + "\n".join(["{0}\t{1}\t{2}".format(k['amount'], k['count_amount'], k['amount']*k['count_amount']) for k in transactions.values('amount').annotate(count_amount=Count('amount')).order_by('-amount')]) - - # premiums offered - - premiums_offered = "id\tamount\tdescription\tcampaign_id\n" + "\n".join(["{0}\t{1}\t{2}\t{3}".format(p.id, p.amount, p.description, p.campaign_id) for p in c.effective_premiums()]) - - transactions_null_premiums_count = 
transactions.filter(premium__isnull=True).count() - -# list stats around premiums - - premium_selected = "Amount\tCount\tPrem. id\tDescription\n" + \ - "\n".join(["{0}\t{1}\t{2}\t{3}".format(k['premium__amount'], k['count_premium'], - k['premium'], k['premium__description']) for k in - transactions.filter(premium__isnull=False).values('premium', - 'premium__description', 'premium__amount').annotate(count_premium=Count( 'premium')).order_by('premium__amount')]) - - return(STATS_TEMPLATE.format(amount_sum, number_pledgers, amount_table, premiums_offered, premium_selected, transactions_null_premiums_count)) - -class Command(BaseCommand): - help = "Displays data about old campaigns" - # args = " " - - def handle(self, **options): - - # Melinda's campaign - c6 = Campaign.objects.get(id=6) - print campaign_stats(c6) - diff --git a/core/management/commands/random_campaigns.py b/core/management/commands/random_campaigns.py deleted file mode 100644 index 163b6a415..000000000 --- a/core/management/commands/random_campaigns.py +++ /dev/null @@ -1,48 +0,0 @@ -from datetime import timedelta -from decimal import Decimal as D -from random import randint, randrange - -from django.conf import settings -from django.core.management.base import BaseCommand -from django.utils.timezone import now - -from regluit.core.models import Work, Campaign - -class Command(BaseCommand): - help = "creates random campaigns for any works that lack one for testing" - - def handle(self, *args, **options): - for work in Work.objects.all(): - if work.campaigns.all().count() > 0: - continue - campaign = Campaign() - campaign.name = work.title - campaign.work = work - campaign.description = "Test Campaign" - - # random campaign target between $200 and $10,000 - campaign.target = D(randint(200,10000)) - - # add a test rightsholder recipient right now - campaign.paypal_receiver = settings.PAYPAL_TEST_RH_EMAIL - - # random deadline between 5 days from now and 180 days from now - _now = now() - 
campaign.deadline = random_date(_now + timedelta(days=5), - _now + timedelta(days=180)) - - # randomly activate some of the campaigns - coinflip = D(randint(0,10)) - if coinflip > 5: - campaign.activate() - - campaign.save() - print "campaign %s...status: %s" % (unicode(campaign).encode('ascii','replace') , campaign.status) - - -def random_date(start, end): - delta = end - start - int_delta = (delta.days * 24 * 60 * 60) + delta.seconds - random_second = randrange(int_delta) - return (start + timedelta(seconds=random_second)) - diff --git a/core/management/commands/random_tasks.py b/core/management/commands/random_tasks.py deleted file mode 100644 index 98246ffd2..000000000 --- a/core/management/commands/random_tasks.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -a command that creates a given number of random tasks to test out celery -""" - -import random - -from django.core.management.base import BaseCommand - -from regluit.core import tasks -from regluit.core.models import CeleryTask - -random.seed() - -class Command(BaseCommand): - help = "create random tasks" - args = "" - - def handle(self, num_tasks, action, **options): - """ - actions: - - c: create num_tasks tasks - s: print state of existing tasks - d: delete all tasks - an integer: compute factorial of the integer -- can then follow up with s to find the state - """ - import django - django.db.transaction.enter_transaction_management() - if action == 'c': - for i in xrange(int(num_tasks)): - n = random.randint(1,1000) - task_id = tasks.fac.delay(n) - - ct = CeleryTask() - ct.task_id = task_id - ct.function_name = 'fac' - ct.function_args = n - ct.description = "Factorial of %d" % (n) - ct.save() - elif action == 's': - for (i, ct) in enumerate(CeleryTask.objects.all()): - print i, ct.function_args, ct.state, ct.info - elif action == 'd': - CeleryTask.objects.all().delete() - else: - try: - action = int(action) - print 'action: %d' % (int(action)) - task_id = tasks.fac.delay(int(action),sleep_interval=0.001) - - 
ct = CeleryTask() - ct.task_id = task_id - ct.function_name = 'fac' - ct.function_args = action - ct.description = "Factorial of %d" % (action) - ct.save() - except Exception, e: - print e - django.db.transaction.commit() \ No newline at end of file diff --git a/core/management/commands/random_wishlists.py b/core/management/commands/random_wishlists.py deleted file mode 100644 index eb61b75cc..000000000 --- a/core/management/commands/random_wishlists.py +++ /dev/null @@ -1,20 +0,0 @@ -from django.contrib.auth.models import User -from django.core.management.base import BaseCommand - -from regluit.core.models import Work - -class Command(BaseCommand): - help = "creates random wishlists for any users" - - def handle(self, *args, **options): - for user in User.objects.all(): - print user - try: - if user.wishlist.works.all().count() != 0: - continue - for work in Work.objects.all(): - print "adding %s to %s's wishlist" % (work, user) - user.wishlist.add_work(work, 'random') - except Exception, e: - print e - pass diff --git a/core/management/commands/recluster_singletons.py b/core/management/commands/recluster_singletons.py index b8ee07a18..e41f2cd55 100644 --- a/core/management/commands/recluster_singletons.py +++ b/core/management/commands/recluster_singletons.py @@ -14,39 +14,56 @@ class Command(BaseCommand): help = "add and merge editions for singleton works" args = " " + def add_arguments(self, parser): + parser.add_argument('language', nargs='+', help="language code") + parser.add_argument('max', nargs='?', type=int, default=100, help="max singletons to process") + parser.add_argument('start', nargs='?', type=int, default=0, help="start") def handle(self, language, max=100, start=0, **options): - print "Number of singleton Works with language = %s: %s" % (language, models.Work.objects.annotate(num_editions=Count('editions')).filter(num_editions=1, language=language).count()) + self.stdout.write("Number of singleton Works with language = %s: %s" % (
language, + models.Work.objects.annotate( + num_editions=Count('editions')).filter(num_editions=1, language=language).count() + ) + ) - try: - max = int(max) - except: - max = 100 - try: - start = int(start) - except: - start = 0 - - for (i, work) in enumerate(islice(models.Work.objects.annotate(num_editions=Count('editions')).filter(num_editions=1, language=language),start,start+max)): + for (i, work) in enumerate(islice( + models.Work.objects.annotate( + num_editions=Count('editions')).filter(num_editions=1, language=language), + start, + start + max + ) + ): #check that there's still only one edition - print "%d %s id:%s #editions:%d #isbn:%s -->" % (i, work.title.encode('ascii','ignore'), work.id, work.editions.count(), work.first_isbn_13()), - work_id=work.id + self.stdout.write("%d %s id:%s #editions:%d #isbn:%s -->" % ( + i, + work.title.encode('ascii','ignore'), + work.id, + work.editions.count(), + work.first_isbn_13(), + )) + work_id = work.id if work.editions.count() != 1: - print + self.stdout.write() continue isbn=work.first_isbn_13() if isbn: new_work = bookloader.relate_isbn( isbn ) if new_work is None: - print "failed to get edition" + self.stdout.write("failed to get edition") elif new_work.id != work_id: - print "added edition to work %s with %s editions" % (new_work.id, new_work.editions.count()) + self.stdout.write("added edition to work %s with %s editions" % (new_work.id, new_work.editions.count())) else: if work.editions.count()>1: - print "singleton joined to new edition" + self.stdout.write("singleton joined to new edition") else: - print "singleton edition not moved" + self.stdout.write("singleton edition not moved") else: - print "no ISBN for this work and therefore no new editions" - print "Updated Number of singleton Works with language = %s: %s" % (language,models.Work.objects.annotate(num_editions=Count('editions')).filter(num_editions=1, language=language).count() ) + self.stdout.write("no ISBN for this work and therefore no new 
editions") + self.stdout.write("Updated Number of singleton Works with language = %s: %s" % ( + language, + models.Work.objects.annotate( + num_editions=Count('editions')).filter(num_editions=1,language=language).count() + ) + ) diff --git a/core/management/commands/rectify_OLA_acknames.py b/core/management/commands/rectify_OLA_acknames.py deleted file mode 100644 index b5dc2c5fa..000000000 --- a/core/management/commands/rectify_OLA_acknames.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -one-time command to ensure transaction.ack_name for OLA returns something sensible -see https://github.com/Gluejar/regluit/pull/97#discussion_r2436193 -""" - -from django.core.management.base import BaseCommand -from regluit.core.models import Campaign -from regluit.payment.models import Transaction - -class Command(BaseCommand): - help = "make sure transaction.ack_name returns something sensible for OLA transactions" - - def handle(self, **options): - ola_campaign = Campaign.objects.filter(work__id=81834) - assert ola_campaign.count() == 1 - ola_campaign = ola_campaign[0] - ola_transactions = Transaction.objects.filter(campaign=ola_campaign) - for t in ola_transactions: - if t.anonymous: - t.extra.update({"ack_name": ''}) - else: - ack_name=t.extra.get("ack_name",'') - if not ack_name: - t.extra.update({"ack_name": t.user.username}) - t.extra.update({"ack_dedication": ''}) - t.save() diff --git a/core/management/commands/refresh_free.py b/core/management/commands/refresh_free.py new file mode 100644 index 000000000..d687c70ef --- /dev/null +++ b/core/management/commands/refresh_free.py @@ -0,0 +1,19 @@ +from django.core.management.base import BaseCommand +from django.db.models import Sum + +from regluit.core.models import Work + + + +class Command(BaseCommand): + '''remove works and editions without titles''' + help = "remove works and editions without titles" + + def handle(self, **options): + qs = Work.objects.annotate(num_free=Sum('editions__ebooks__active')).filter(num_free__gt=0) + 
for free in qs.filter(is_free=False): + self.stdout.write('freeing %s' % free.title) + free.is_free = True + for subject in free.subjects.all(): + subject.count_free() + free.save() diff --git a/core/management/commands/relookup_isbns.py b/core/management/commands/relookup_isbns.py index bc1db56cd..e6c4a7677 100644 --- a/core/management/commands/relookup_isbns.py +++ b/core/management/commands/relookup_isbns.py @@ -8,17 +8,18 @@ class Command(BaseCommand): help = "relookup all editions attached to language=xx works" - args = "" + def add_arguments(self, parser): + parser.add_argument('title', nargs='?', default='', help="start of title") def handle(self, title='', **options): - print "Number of Works with language=xx, title like %s: %s" % (title, models.Work.objects.filter(language='xx', title__istartswith=title).count()) - updated_num=0 + self.stdout.write("Number of Works with language=xx, title like %s: %s" % (title, models.Work.objects.filter(language='xx', title__istartswith=title).count())) + updated_num = 0 for work in models.Work.objects.filter(language='xx', title__istartswith=title): - print "updating work %s" % work + self.stdout.write("updating work %s" % work) for edition in work.editions.all(): - print "updating edition %s" % edition + self.stdout.write("updating edition %s" % edition) updated = bookloader.update_edition(edition) if updated.work.language!= 'xx': updated_num+=1 - print "Number of updated editions= %s" % updated_num + self.stdout.write("Number of updated editions= %s" % updated_num) diff --git a/core/management/commands/remove_404s.py b/core/management/commands/remove_404s.py new file mode 100644 index 000000000..bf98f944d --- /dev/null +++ b/core/management/commands/remove_404s.py @@ -0,0 +1,43 @@ +import requests +from django.core.management.base import BaseCommand + +from regluit.core.models import Ebook + +class Command(BaseCommand): + help = "check ebooks for 404s and remove if needed" + args = "<limit>" + + def add_arguments(self, 
parser): + parser.add_argument('limit', nargs='?', type=int, default=50, help="max to check") + parser.add_argument('--ebook', nargs='?', type=int, default=0, help="ebook to check") + parser.add_argument('--provider', nargs='?', default='', help="provider to check") + parser.add_argument('--format', nargs='?', default='online', help="format to check") + + def handle(self, limit=0, **options): + limit = int(limit) if limit else 0 + format = options.get('format') + if format == 'all': + onlines = Ebook.objects.all() + else: + onlines = Ebook.objects.filter(format=format) + if options.get('ebook'): + onlines = Ebook.objects.filter(id=options.get('ebook')) + elif options.get('provider'): + onlines = onlines.filter(provider=options.get('provider')) + removed = [] + done = 0 + for online in onlines: + if not online.ebook_files.exists(): + try: + r = requests.get(online.url) + if r.status_code == 404: + removed.append(online.edition.id) + self.stdout.write(online.edition.title) + online.delete() + except UnicodeDecodeError: + self.stdout.write("Encoding error for %s" % online.url) + done +=1 + if done >= limit or done >= 500: + break + self.stdout.write("%s ebooks checked" % done) + self.stdout.write("%s ebooks removed" % len(removed)) diff --git a/core/management/commands/remove_orphan_editions.py b/core/management/commands/remove_orphan_editions.py index fa63ae50b..94d7f626d 100644 --- a/core/management/commands/remove_orphan_editions.py +++ b/core/management/commands/remove_orphan_editions.py @@ -15,4 +15,4 @@ def handle(self, **options): edition.delete() deleted=deleted+1 numeditions=numeditions+1 - print "%s deleted from %s total" % (deleted, numeditions) + self.stdout.write("%s deleted from %s total" % (deleted, numeditions)) diff --git a/core/management/commands/remove_orphan_works.py b/core/management/commands/remove_orphan_works.py index ce1adbf14..1acc804de 100644 --- a/core/management/commands/remove_orphan_works.py +++ 
b/core/management/commands/remove_orphan_works.py @@ -15,4 +15,4 @@ def handle(self, **options): work.delete() deleted=deleted+1 numworks=numworks+1 - print "%s deleted from %s total" % (deleted, numworks) + self.stdout.write("%s deleted from %s total" % (deleted, numworks)) diff --git a/core/management/commands/seed_degruyter.html b/core/management/commands/seed_degruyter.html deleted file mode 100644 index 30b1aaadc..000000000 --- a/core/management/commands/seed_degruyter.html +++ /dev/null @@ -1,40 +0,0 @@ -<div class="launch_top" id="degruyter_countdown" style="font-size:20px;text-align:center;width:50%"></div> - -<h4>Help us unglue this book!</h4> -<p>De Gruyter has agreed to run an ungluing campaign for this book, if it can get enough support from ungluers like you. The target price will be $2100, after which the book will be free for everyone on earth to read, copy, and share, forever (under a Creative Commons <a href="https://creativecommons.org/licenses/by-nc-nd/3.0/">BY-NC-ND</a> license).</p> - -<p>They'll launch a campaign when 50 ungluers have wished for this book. Right now <span id="wisher_data"></span>. </p> - -<p id="cta"></p> - -<hr> - -<script type="text/javascript"> - var $j = jQuery.noConflict(); - $j(document).ready(function(){ - var countdown = 50 - numWishers; - if(countdown == 1) { - $j("#degruyter_countdown").html("Only 1 more ungluer to go!"); - } else { - $j("#degruyter_countdown").html(countdown + " ungluers to go"); - } - - if(numWishers == 1) { - var wisherDataText = "1 ungluer has wished for this book" - } else { - var wisherDataText = numWishers + " ungluers have wished for this book" - } - $j("#wisher_data").html(wisherDataText); - - if(isSupporter){ - if(numWishers == 1) { - var callToAction = "Thanks for getting things started! Will you ask your friends to join you?" - } else { - var callToAction = "Thanks for being one of them! Will you ask your friends to join you?" - } - } else { - var callToAction = "Won't you join in?" 
- } - $j("#cta").html(callToAction); - }); -</script> diff --git a/core/management/commands/seed_degruyter_templates.py b/core/management/commands/seed_degruyter_templates.py deleted file mode 100644 index 3604a923b..000000000 --- a/core/management/commands/seed_degruyter_templates.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -seed empty but initialized deGruyter books with something useful -""" - -from django.core.management.base import BaseCommand -from regluit.core.models import Work - -class Command(BaseCommand): - help = "Seed empty but initialized deGruyter books with something useful. Takes filename containing seed description as argument. Can be safely run more than once; will ignore books with descriptions." - - def handle(self, filename, **options): - books = Work.objects.filter(editions__publisher_name__id=4311, campaigns__status="INITIALIZED") - for book in books: - if not 'degruyter_countdown' in book.description: - """ - read in file and prepend to description - ignores descriptions that already start with the seed file - """ - seed_file = open(filename) - book.description = seed_file.read() + book.description - book.save() - seed_file.close() \ No newline at end of file diff --git a/core/management/commands/set_campaign_editions.py b/core/management/commands/set_campaign_editions.py deleted file mode 100644 index 0a7ff944d..000000000 --- a/core/management/commands/set_campaign_editions.py +++ /dev/null @@ -1,14 +0,0 @@ -from django.core.management.base import BaseCommand -from regluit.core.models import Campaign - -class Command(BaseCommand): - help = "set campaign edition for every campaign" - - def handle(self, **options): - fixed = 0 - for campaign in Campaign.objects.all(): - if not campaign.edition: - campaign.edition = campaign.work.editions.all()[0] - campaign.save() - fixed +=1 - print "{} campaign editions set".format(fixed) diff --git a/core/management/commands/set_key.py b/core/management/commands/set_key.py deleted file mode 100644 index 
6ce71264d..000000000 --- a/core/management/commands/set_key.py +++ /dev/null @@ -1,12 +0,0 @@ -from django.core.management.base import BaseCommand -from regluit.core.models import Key - -class Command(BaseCommand): - help = "set a core.models.Key with name value" - args = "<name> <value>" - - def handle(self, name, value, **options): - (k, created) = Key.objects.get_or_create(name=name) - k.value = value - k.save() - diff --git a/core/management/commands/subjects_from_bic.py b/core/management/commands/subjects_from_bic.py new file mode 100644 index 000000000..ddbdf7caf --- /dev/null +++ b/core/management/commands/subjects_from_bic.py @@ -0,0 +1,18 @@ +import string +from django.core.management.base import BaseCommand +from regluit.core.models import Subject +from regluit.core.validation import explode_bic + +class Command(BaseCommand): + help = "explode compound bic subjects from doab" + + def handle(self, **options): + matches=0 + for subject in Subject.objects.filter(name__startswith='bic Book Indus'): + newsubs = explode_bic(subject.name) + for work in subject.works.all(): + for subsub in newsubs: + Subject.set_by_name(subsub, work) + subject.delete() + + self.stdout.write("bic headings exploded" ) diff --git a/core/management/commands/subjects_to_bisac.py b/core/management/commands/subjects_to_bisac.py index 679b6b1ab..31771e5f9 100644 --- a/core/management/commands/subjects_to_bisac.py +++ b/core/management/commands/subjects_to_bisac.py @@ -14,4 +14,4 @@ def handle(self, **options): subject.name = bisac_heading.full_label subject.save() matches += 1 - print "%s bisac headings converted" % matches + self.stdout.write("%s bisac headings converted" % matches) diff --git a/core/management/commands/translate_doab_ids.py b/core/management/commands/translate_doab_ids.py new file mode 100644 index 000000000..4de5ac94b --- /dev/null +++ b/core/management/commands/translate_doab_ids.py @@ -0,0 +1,74 @@ +import csv +import json +import boto3 +from botocore.exceptions 
import ClientError + +from django.conf import settings +from django.core.management.base import BaseCommand + +from regluit.core.models import Edition, Identifier + +s3 = boto3.resource('s3', + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) + + + +class Command(BaseCommand): + help = "translate doab ids to handles" + def add_arguments(self, parser): + parser.add_argument('filename', nargs='?', help="filename") + parser.add_argument('--old_id', nargs='?', default=None, help="id to translate") + + def handle(self, filename, **options): + self.stdout.write("doab ids to start: %s" % Identifier.objects.filter(type='doab').count()) + with open(filename, 'r') as jsonfile: + newdoab = json.loads(jsonfile.read()) + done = 0 + if options['old_id']: + to_do = Identifier.objects.filter(type='doab', value=options['old_id']) + else: + to_do = Identifier.objects.filter(type='doab') + for doab in to_do: + if doab.value.startswith("20.500.12854"): + continue + if doab.value in newdoab: + # already done + if Identifier.objects.filter(type='doab', value=newdoab[doab.value]).exists(): + doab.delete() + else: + old_cover_file_name = 'doab/%s/cover' % doab.value + new_cover_file_name = 'doab/%s' % newdoab[doab.value] + self.move_cover(old_cover_file_name, new_cover_file_name) + doab.value = newdoab[doab.value] + doab.save() + else: + doab.delete() + done += 1 + self.stdout.write("doab ids at end: %s" % Identifier.objects.filter(type='doab').count()) + self.stdout.write("done:: %s" % done) + + def move_cover(self, old_name, new_name): + if old_name == new_name: + return + old_url = "https://{}.s3.amazonaws.com/{}".format( + settings.AWS_STORAGE_BUCKET_NAME, old_name) + new_url = "https://{}.s3.amazonaws.com/{}".format( + settings.AWS_STORAGE_BUCKET_NAME, new_name) + copy_source = { + 'Bucket': settings.AWS_STORAGE_BUCKET_NAME, + 'Key': old_name + } + try: + s3.meta.client.copy_object( + CopySource=copy_source, + 
Bucket=settings.AWS_STORAGE_BUCKET_NAME, + Key=new_name, ACL='public-read') + + for ed in Edition.objects.filter(cover_image__contains=old_name): + ed.cover_image = new_url + ed.save() + + s3.meta.client.delete_object(Bucket=settings.AWS_STORAGE_BUCKET_NAME, Key=old_name,) + except ClientError: + self.stdout.write("problem moving %s to %s" % (old_name, new_name)) diff --git a/core/management/commands/update_downloads.py b/core/management/commands/update_downloads.py new file mode 100644 index 000000000..887e108b9 --- /dev/null +++ b/core/management/commands/update_downloads.py @@ -0,0 +1,55 @@ +import os +from datetime import datetime + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.db.models import F, Sum + +from regluit.core.models import Ebook + +DOWNLOAD_LOGFILE = settings.LOGGING['handlers']['downloads']['filename'] + +class Command(BaseCommand): + '''add logged downloads to ebook objects''' + help = "add logged downloads to ebook objects" + + def handle(self, **options): + dls = {} + date_format = "%Y-%m-%d" + + this_mo = datetime.today().month + last_month = this_mo - 1 + year = datetime.today().year + if last_month <= 0: + last_month = last_month + 12 + total = 0 + for suffix in ['', '.1','.2','.3','.4','.5', '.6', '.7', '.8',]: + fn = DOWNLOAD_LOGFILE + suffix + if os.path.exists(fn): + with open(fn,'r') as logfile: + for line in logfile.readlines(): + try: + (date, time, colon, ebook) = line.split() + except ValueError: + # garbage line + continue + month = datetime.strptime(date, date_format).date().month + if month == last_month: + dls[ebook] = dls.get(ebook, 0) + 1 + total += 1 + + downloads = Ebook.objects.aggregate(total=Sum('download_count'))['total'] + self.stdout.write(f'old count: {downloads} downloads') + self.stdout.write(f'logging {total} downloads for {len(dls)} ebooks') + + for key in dls.keys(): + if dls[key] > settings.DOWNLOAD_LOGS_MAX: + self.stdout.write(f'{dls[key]} downloads for 
ebook {key} discarded.' ) + continue + try: + Ebook.objects.filter(id=key).update(download_count=F('download_count') + dls[key]) + except Ebook.DoesNotExist: + self.stdout.write(f'ebook {key} not found') + + downloads = Ebook.objects.aggregate(total=Sum('download_count'))['total'] + self.stdout.write(f'new count: {downloads} downloads') diff --git a/core/management/commands/update_providers.py b/core/management/commands/update_providers.py new file mode 100644 index 000000000..8a24d749a --- /dev/null +++ b/core/management/commands/update_providers.py @@ -0,0 +1,39 @@ +from django.core.management.base import BaseCommand +from django.db.models import Q + +from regluit.core.loaders.harvest import dl_online, RateLimiter +from regluit.core.models import Ebook + +class Command(BaseCommand): + help = "recalculate provider from url" + args = "<limit>" + + def add_arguments(self, parser): + parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest") + + def handle(self, limit=0, **options): + done = 0 + limit = int(limit) if limit else 0 + unstripped = Ebook.objects.filter(Q(provider='') | Q(provider__startswith='www.')) + for ebook in unstripped: + ebook.url = ebook.url.strip() + new_provider = Ebook.infer_provider(ebook.url) + if new_provider != ebook.provider: + ebook.provider = new_provider + ebook.save() + done += 1 + self.stdout.write('{} urls or netloc stripped'.format(done)) + done = 0 + stale = Ebook.objects.filter(Q(provider__icontains='doi') | Q(provider='Handle Proxy')) + self.stdout.write('{} providers to update'.format(stale.count())) + for ebook in stale: + new_provider = Ebook.infer_provider(ebook.url) + if new_provider != ebook.provider: + ebook.provider = new_provider + ebook.save() + done += 1 + if done > limit or done >= 100: + break + self.stdout.write('{} ebooks updated'.format(done)) + if done == 100: + self.stdout.write('100 is the maximum; repeat to do more') diff --git a/core/management/commands/zap_frankenworks.py 
b/core/management/commands/zap_frankenworks.py deleted file mode 100644 index 850c2aef9..000000000 --- a/core/management/commands/zap_frankenworks.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been -affected. -""" - -from django.core.management.base import BaseCommand -from regluit.test import booktests - -class Command(BaseCommand): - help = "Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been affected." - args = "<do>" - - def handle(self, do, **options): - - try: - do = str(do) - if do.lower() == 'true': - do = True - else: - do = False - except: - do = False - - print "before..." - s = booktests.cluster_status() - print s['results'] - - booktests.clean_frankenworks(s, do=do) - s = booktests.cluster_status() - print "after cleanup...." - print "results ", s['results'] - print "scattered clusters ", s['scattered_clusters'] - print "franken works", s['franken_works'] diff --git a/core/migrations/0001_initial.py b/core/migrations/0001_initial.py index 3c13100e5..7e753d38c 100644 --- a/core/migrations/0001_initial.py +++ b/core/migrations/0001_initial.py @@ -152,7 +152,7 @@ class Migration(migrations.Migration): ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('type', models.CharField(max_length=4)), ('value', models.CharField(max_length=250)), - ('edition', models.ForeignKey(related_name='identifiers', to='core.Edition', null=True)), + ('edition', models.ForeignKey(on_delete=models.CASCADE, related_name='identifiers', to='core.Edition', null=True)), ], ), migrations.CreateModel( @@ -168,7 +168,7 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('marc_link_target', models.CharField(default=b'UNGLUE', max_length=6, verbose_name=b'MARC record link targets', 
choices=[(b'DIRECT', b'Raw link'), (b'UNGLUE', b'Unglue.it link')])), - ('user', models.OneToOneField(related_name='libpref', to=settings.AUTH_USER_MODEL)), + ('user', models.OneToOneField(on_delete=models.CASCADE, related_name='libpref', to=settings.AUTH_USER_MODEL)), ], ), migrations.CreateModel( @@ -189,7 +189,7 @@ class Migration(migrations.Migration): ('amount', models.DecimalField(max_digits=10, decimal_places=0)), ('description', models.TextField(null=True)), ('limit', models.IntegerField(default=0)), - ('campaign', models.ForeignKey(related_name='premiums', to='core.Campaign', null=True)), + ('campaign', models.ForeignKey(on_delete=models.CASCADE, related_name='premiums', to='core.Campaign', null=True)), ], ), migrations.CreateModel( @@ -220,7 +220,7 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('name', models.CharField(unique=True, max_length=255)), - ('publisher', models.ForeignKey(related_name='alternate_names', to='core.Publisher', null=True)), + ('publisher', models.ForeignKey(on_delete=models.CASCADE, related_name='alternate_names', to='core.Publisher', null=True)), ], ), migrations.CreateModel( @@ -235,9 +235,9 @@ class Migration(migrations.Migration): name='Relator', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), - ('author', models.ForeignKey(to='core.Author')), - ('edition', models.ForeignKey(related_name='relators', to='core.Edition')), - ('relation', models.ForeignKey(default=1, to='core.Relation')), + ('author', models.ForeignKey(on_delete=models.CASCADE, to='core.Author')), + ('edition', models.ForeignKey(on_delete=models.CASCADE, related_name='relators', to='core.Edition')), + ('relation', models.ForeignKey(on_delete=models.CASCADE, default=1, to='core.Relation')), ], options={ 'db_table': 'core_author_editions', @@ -251,7 +251,7 @@ class Migration(migrations.Migration): 
('email', models.CharField(max_length=100, blank=True)), ('rights_holder_name', models.CharField(max_length=100)), ('can_sell', models.BooleanField(default=False)), - ('owner', models.ForeignKey(related_name='rights_holder', to=settings.AUTH_USER_MODEL)), + ('owner', models.ForeignKey(on_delete=models.CASCADE, related_name='rights_holder', to=settings.AUTH_USER_MODEL)), ], ), migrations.CreateModel( @@ -286,7 +286,7 @@ class Migration(migrations.Migration): ('goodreads_user_link', models.CharField(max_length=200, null=True, blank=True)), ('avatar_source', models.PositiveSmallIntegerField(default=4, null=True, choices=[(0, b'No Avatar, Please'), (1, b'Gravatar'), (2, b'Twitter'), (3, b'Facebook'), (4, b'Unglueitar')])), ('badges', models.ManyToManyField(related_name='holders', to='core.Badge')), - ('user', models.OneToOneField(related_name='profile', to=settings.AUTH_USER_MODEL)), + ('user', models.OneToOneField(on_delete=models.CASCADE, related_name='profile', to=settings.AUTH_USER_MODEL)), ], ), migrations.CreateModel( @@ -295,7 +295,7 @@ class Migration(migrations.Migration): ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('was', models.IntegerField(unique=True)), ('moved', models.DateTimeField(auto_now_add=True)), - ('user', models.ForeignKey(to=settings.AUTH_USER_MODEL, null=True)), + ('user', models.ForeignKey(on_delete=models.CASCADE, to=settings.AUTH_USER_MODEL, null=True)), ], ), migrations.CreateModel( @@ -314,7 +314,7 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('created', models.DateTimeField(auto_now_add=True)), - ('user', models.OneToOneField(related_name='wishlist', to=settings.AUTH_USER_MODEL)), + ('user', models.OneToOneField(on_delete=models.CASCADE, related_name='wishlist', to=settings.AUTH_USER_MODEL)), ], ), migrations.CreateModel( @@ -330,7 +330,7 @@ class Migration(migrations.Migration): 
('publication_range', models.CharField(max_length=50, null=True)), ('featured', models.DateTimeField(db_index=True, null=True, blank=True)), ('is_free', models.BooleanField(default=False)), - ('selected_edition', models.ForeignKey(related_name='selected_works', to='core.Edition', null=True)), + ('selected_edition', models.ForeignKey(on_delete=models.CASCADE, related_name='selected_works', to='core.Edition', null=True)), ], options={ 'ordering': ['title'], @@ -344,17 +344,17 @@ class Migration(migrations.Migration): migrations.AddField( model_name='wishes', name='wishlist', - field=models.ForeignKey(to='core.Wishlist'), + field=models.ForeignKey(on_delete=models.CASCADE, to='core.Wishlist'), ), migrations.AddField( model_name='wishes', name='work', - field=models.ForeignKey(related_name='wishes', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='wishes', to='core.Work'), ), migrations.AddField( model_name='waswork', name='work', - field=models.ForeignKey(to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, to='core.Work'), ), migrations.AddField( model_name='subject', @@ -364,16 +364,16 @@ class Migration(migrations.Migration): migrations.AddField( model_name='publisher', name='name', - field=models.ForeignKey(related_name='key_publisher', to='core.PublisherName'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='key_publisher', to='core.PublisherName'), ), migrations.AddField( model_name='offer', name='work', - field=models.ForeignKey(related_name='offers', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='offers', to='core.Work'), ), migrations.AddField( model_name='identifier', name='work', - field=models.ForeignKey(related_name='identifiers', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='identifiers', to='core.Work'), ), ] diff --git a/core/migrations/0002_auto_20160722_1716.py b/core/migrations/0002_auto_20160722_1716.py index 
6c8eaedfb..f6ff8dc46 100644 --- a/core/migrations/0002_auto_20160722_1716.py +++ b/core/migrations/0002_auto_20160722_1716.py @@ -18,82 +18,82 @@ class Migration(migrations.Migration): migrations.AddField( model_name='hold', name='library', - field=models.ForeignKey(related_name='holds', to='libraryauth.Library'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='holds', to='libraryauth.Library'), ), migrations.AddField( model_name='hold', name='user', - field=models.ForeignKey(related_name='holds', to=settings.AUTH_USER_MODEL), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='holds', to=settings.AUTH_USER_MODEL), ), migrations.AddField( model_name='hold', name='work', - field=models.ForeignKey(related_name='holds', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='holds', to='core.Work'), ), migrations.AddField( model_name='gift', name='acq', - field=models.ForeignKey(related_name='gifts', to='core.Acq'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='gifts', to='core.Acq'), ), migrations.AddField( model_name='gift', name='giver', - field=models.ForeignKey(related_name='gifts', to=settings.AUTH_USER_MODEL), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='gifts', to=settings.AUTH_USER_MODEL), ), migrations.AddField( model_name='edition', name='publisher_name', - field=models.ForeignKey(related_name='editions', to='core.PublisherName', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='editions', to='core.PublisherName', null=True), ), migrations.AddField( model_name='edition', name='work', - field=models.ForeignKey(related_name='editions', to='core.Work', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='editions', to='core.Work', null=True), ), migrations.AddField( model_name='ebookfile', name='edition', - field=models.ForeignKey(related_name='ebook_files', to='core.Edition'), + 
field=models.ForeignKey(on_delete=models.CASCADE, related_name='ebook_files', to='core.Edition'), ), migrations.AddField( model_name='ebook', name='edition', - field=models.ForeignKey(related_name='ebooks', to='core.Edition'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='ebooks', to='core.Edition'), ), migrations.AddField( model_name='ebook', name='user', - field=models.ForeignKey(to=settings.AUTH_USER_MODEL, null=True), + field=models.ForeignKey(on_delete=models.CASCADE, to=settings.AUTH_USER_MODEL, null=True), ), migrations.AddField( model_name='claim', name='rights_holder', - field=models.ForeignKey(related_name='claim', to='core.RightsHolder'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='claim', to='core.RightsHolder'), ), migrations.AddField( model_name='claim', name='user', - field=models.ForeignKey(related_name='claim', to=settings.AUTH_USER_MODEL), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='claim', to=settings.AUTH_USER_MODEL), ), migrations.AddField( model_name='claim', name='work', - field=models.ForeignKey(related_name='claim', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='claim', to='core.Work'), ), migrations.AddField( model_name='celerytask', name='user', - field=models.ForeignKey(related_name='tasks', to=settings.AUTH_USER_MODEL, null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='tasks', to=settings.AUTH_USER_MODEL, null=True), ), migrations.AddField( model_name='campaignaction', name='campaign', - field=models.ForeignKey(related_name='actions', to='core.Campaign'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='actions', to='core.Campaign'), ), migrations.AddField( model_name='campaign', name='edition', - field=models.ForeignKey(related_name='campaigns', to='core.Edition', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='campaigns', to='core.Edition', null=True), ), 
migrations.AddField( model_name='campaign', @@ -103,12 +103,12 @@ class Migration(migrations.Migration): migrations.AddField( model_name='campaign', name='publisher', - field=models.ForeignKey(related_name='campaigns', to='core.Publisher', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='campaigns', to='core.Publisher', null=True), ), migrations.AddField( model_name='campaign', name='work', - field=models.ForeignKey(related_name='campaigns', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='campaigns', to='core.Work'), ), migrations.AddField( model_name='author', @@ -118,22 +118,22 @@ class Migration(migrations.Migration): migrations.AddField( model_name='acq', name='lib_acq', - field=models.ForeignKey(related_name='loans', to='core.Acq', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='loans', to='core.Acq', null=True), ), migrations.AddField( model_name='acq', name='user', - field=models.ForeignKey(related_name='acqs', to=settings.AUTH_USER_MODEL), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='acqs', to=settings.AUTH_USER_MODEL), ), migrations.AddField( model_name='acq', name='watermarked', - field=models.ForeignKey(to='booxtream.Boox', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, to='booxtream.Boox', null=True), ), migrations.AddField( model_name='acq', name='work', - field=models.ForeignKey(related_name='acqs', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='acqs', to='core.Work'), ), migrations.AlterUniqueTogether( name='identifier', diff --git a/core/migrations/0003_auto_20160816_1645.py b/core/migrations/0003_auto_20160816_1645.py index 1ae7aa686..b123b5fc5 100644 --- a/core/migrations/0003_auto_20160816_1645.py +++ b/core/migrations/0003_auto_20160816_1645.py @@ -38,17 +38,17 @@ class Migration(migrations.Migration): migrations.AddField( model_name='workrelation', name='from_work', - 
field=models.ForeignKey(related_name='works_related_from', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='works_related_from', to='core.Work'), ), migrations.AddField( model_name='workrelation', name='to_work', - field=models.ForeignKey(related_name='works_related_to', to='core.Work'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='works_related_to', to='core.Work'), ), migrations.AddField( model_name='edition', name='note', - field=models.ForeignKey(to='core.EditionNote', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, to='core.EditionNote', null=True), ), migrations.AddField( model_name='work', diff --git a/core/migrations/0005_ebookfile_ebook.py b/core/migrations/0005_ebookfile_ebook.py index c738940b8..923a5b104 100644 --- a/core/migrations/0005_ebookfile_ebook.py +++ b/core/migrations/0005_ebookfile_ebook.py @@ -14,6 +14,6 @@ class Migration(migrations.Migration): migrations.AddField( model_name='ebookfile', name='ebook', - field=models.ForeignKey(related_name='ebook_files', to='core.Ebook', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='ebook_files', to='core.Ebook', null=True), ), ] diff --git a/core/migrations/0006_auto_20160818_1809.py b/core/migrations/0006_auto_20160818_1809.py index 5b3cbe4c1..ad093254d 100644 --- a/core/migrations/0006_auto_20160818_1809.py +++ b/core/migrations/0006_auto_20160818_1809.py @@ -39,7 +39,7 @@ def add_ebooks_to_ebfs(apps, schema_editor): elif ebf.edition.work.campaigns.filter(type=2): pass else: - print 'ebf {} is dangling'.format(ebf.id) + print('ebf {} is dangling'.format(ebf.id)) def noop(apps, schema_editor): pass diff --git a/core/migrations/0007_auto_20160923_1314.py b/core/migrations/0007_auto_20160923_1314.py index 3f1ee8b25..8feb68fec 100644 --- a/core/migrations/0007_auto_20160923_1314.py +++ b/core/migrations/0007_auto_20160923_1314.py @@ -28,12 +28,12 @@ class Migration(migrations.Migration): 
migrations.AlterField( model_name='edition', name='note', - field=models.ForeignKey(blank=True, to='core.EditionNote', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, blank=True, to='core.EditionNote', null=True), ), migrations.AlterField( model_name='edition', name='publisher_name', - field=models.ForeignKey(related_name='editions', blank=True, to='core.PublisherName', null=True), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='editions', blank=True, to='core.PublisherName', null=True), ), migrations.AlterField( model_name='userprofile', diff --git a/core/migrations/0015_auto_20180720_1413.py b/core/migrations/0015_auto_20180720_1413.py new file mode 100644 index 000000000..e9bf39bfc --- /dev/null +++ b/core/migrations/0015_auto_20180720_1413.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0014_auto_20180618_1646'), + ] + + operations = [ + migrations.AlterField( + model_name='userprofile', + name='avatar_source', + field=models.PositiveSmallIntegerField(default=4, null=True, choices=[(0, b'No Avatar, Please'), (1, b'Gravatar'), (2, b'Twitter/Facebook'), (4, b'Unglueitar')]), + ), + migrations.AlterField( + model_name='userprofile', + name='facebook_id', + field=models.CharField(default='', max_length=31, blank=True), + preserve_default=False, + ), + ] diff --git a/core/migrations/0016_auto_20181108_1646.py b/core/migrations/0016_auto_20181108_1646.py new file mode 100644 index 000000000..27fd29c8e --- /dev/null +++ b/core/migrations/0016_auto_20181108_1646.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.14 on 2018-11-08 16:46 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0015_auto_20180720_1413'), + ] + + operations = [ + migrations.RunSQL( + 
['CREATE FULLTEXT INDEX core_work_index ON core_work (title);'], + ['DROP INDEX core_work_index on core_work;'], + ), + migrations.RunSQL( + ['CREATE FULLTEXT INDEX core_author_index ON core_author (name);'], + ['DROP INDEX core_author_index on core_author;'], + ), + ] diff --git a/core/migrations/0017_auto_20190227_1457.py b/core/migrations/0017_auto_20190227_1457.py new file mode 100644 index 000000000..c372feae8 --- /dev/null +++ b/core/migrations/0017_auto_20190227_1457.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.14 on 2019-02-27 14:57 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0016_auto_20181108_1646'), + ] + + operations = [ + migrations.AddIndex( + model_name='edition', + index=models.Index(fields=['work'], name='core_editio_work_id_3ae536_idx'), + ), + migrations.AddIndex( + model_name='author', + index=models.Index(fields=['name'], name='core_author_name_fca240_idx'), + ), + migrations.AddIndex( + model_name='subject', + index=models.Index(fields=['name'], name='core_subjec_name_36111e_idx'), + ), + migrations.AddIndex( + model_name='work', + index=models.Index(fields=['is_free', 'title'], name='core_work_is_free_1e4d06_idx'), + ), + ] diff --git a/core/migrations/0018_auto_20200214_1347.py b/core/migrations/0018_auto_20200214_1347.py new file mode 100644 index 000000000..8583ef713 --- /dev/null +++ b/core/migrations/0018_auto_20200214_1347.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.28 on 2020-02-14 13:47 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0017_auto_20190227_1457'), + ] + + operations = [ + migrations.AlterField( + model_name='acq', + name='license', + field=models.PositiveSmallIntegerField(choices=[(1, 'Individual license'), (2, 'Library License'), (3, 
'Borrowed from Library'), (0, 'Just for Testing'), (4, 'On Reserve'), (5, 'Already Thanked')], default=1), + ), + migrations.AlterField( + model_name='badge', + name='description', + field=models.TextField(default='', null=True), + ), + migrations.AlterField( + model_name='campaign', + name='license', + field=models.CharField(choices=[('CC BY-NC-ND', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)'), ('CC BY-NC-SA', 'Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)'), ('CC BY-NC', 'Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0)'), ('CC BY-ND', 'Creative Commons Attribution-NoDerivs 3.0 Unported (CC BY-ND 3.0)'), ('CC BY-SA', 'Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)'), ('CC BY', 'Creative Commons Attribution 3.0 Unported (CC BY 3.0)'), ('CC0', 'No Rights Reserved (CC0)'), ('GFDL', 'GNU Free Documentation License'), ('LAL', 'Licence Art Libre'), ('OSI', 'OSI Approved License')], default='CC BY-NC-ND', max_length=255), + ), + migrations.AlterField( + model_name='campaign', + name='status', + field=models.CharField(choices=[('INITIALIZED', 'INITIALIZED'), ('ACTIVE', 'ACTIVE'), ('SUSPENDED', 'SUSPENDED'), ('WITHDRAWN', 'WITHDRAWN'), ('SUCCESSFUL', 'SUCCESSFUL'), ('UNSUCCESSFUL', 'UNSUCCESSFUL')], db_index=True, default='INITIALIZED', max_length=15, null=True), + ), + migrations.AlterField( + model_name='campaign', + name='type', + field=models.PositiveSmallIntegerField(choices=[(1, 'Pledge-to-unglue campaign'), (2, 'Buy-to-unglue campaign'), (3, 'Thanks-for-ungluing campaign')], default=1), + ), + migrations.AlterField( + model_name='claim', + name='status', + field=models.CharField(choices=[('active', 'Claim has been accepted.'), ('pending', 'Claim is pending acceptance.'), ('release', 'Claim has not been accepted.')], default='active', max_length=7), + ), + migrations.AlterField( + model_name='ebook', + name='format', + 
field=models.CharField(choices=[('pdf', 'PDF'), ('epub', 'EPUB'), ('html', 'HTML'), ('text', 'TEXT'), ('mobi', 'MOBI')], max_length=25), + ), + migrations.AlterField( + model_name='ebook', + name='rights', + field=models.CharField(choices=[('CC BY-NC-ND', 'Creative Commons Attribution-NonCommercial-NoDerivs'), ('CC BY-NC-SA', 'Creative Commons Attribution-NonCommercial-ShareAlike'), ('CC BY-NC', 'Creative Commons Attribution-NonCommercial'), ('CC BY-ND', 'Creative Commons Attribution-NoDerivs'), ('CC BY-SA', 'Creative Commons Attribution-ShareAlike'), ('CC BY', 'Creative Commons Attribution'), ('CC0', 'No Rights Reserved (CC0)'), ('GFDL', 'GNU Free Documentation License'), ('LAL', 'Licence Art Libre'), ('OSI', 'OSI Approved License'), ('PD-US', 'Public Domain, US')], db_index=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='ebook', + name='version_label', + field=models.CharField(blank=True, default='', max_length=255), + ), + migrations.AlterField( + model_name='ebookfile', + name='format', + field=models.CharField(choices=[('pdf', 'PDF'), ('epub', 'EPUB'), ('html', 'HTML'), ('text', 'TEXT'), ('mobi', 'MOBI')], max_length=25), + ), + migrations.AlterField( + model_name='gift', + name='message', + field=models.TextField(default='', max_length=512), + ), + migrations.AlterField( + model_name='libpref', + name='marc_link_target', + field=models.CharField(choices=[('DIRECT', 'Raw link'), ('UNGLUE', 'Unglue.it link')], default='UNGLUE', max_length=6, verbose_name='MARC record link targets'), + ), + migrations.AlterField( + model_name='offer', + name='license', + field=models.PositiveSmallIntegerField(choices=[(1, 'Individual license'), (2, 'Library License')], default=1), + ), + migrations.AlterField( + model_name='publisher', + name='description', + field=models.TextField(blank=True, default='', null=True), + ), + migrations.AlterField( + model_name='rightsholder', + name='address', + field=models.CharField(default='', max_length=400), + 
), + migrations.AlterField( + model_name='rightsholder', + name='email', + field=models.CharField(default='', max_length=100), + ), + migrations.AlterField( + model_name='rightsholder', + name='mailing', + field=models.CharField(default='', max_length=400), + ), + migrations.AlterField( + model_name='rightsholder', + name='signature', + field=models.CharField(default='', max_length=100), + ), + migrations.AlterField( + model_name='rightsholder', + name='signer', + field=models.CharField(default='', max_length=100), + ), + migrations.AlterField( + model_name='rightsholder', + name='signer_title', + field=models.CharField(default='', max_length=30), + ), + migrations.AlterField( + model_name='subject', + name='authority', + field=models.CharField(default='', max_length=10), + ), + migrations.AlterField( + model_name='userprofile', + name='avatar_source', + field=models.PositiveSmallIntegerField(choices=[(0, 'No Avatar, Please'), (1, 'Gravatar'), (2, 'Twitter/Facebook'), (4, 'Unglueitar')], default=4, null=True), + ), + migrations.AlterField( + model_name='work', + name='age_level', + field=models.CharField(blank=True, choices=[('', 'No Rating'), ('5-6', "Children's - Kindergarten, Age 5-6"), ('6-7', "Children's - Grade 1-2, Age 6-7"), ('7-8', "Children's - Grade 2-3, Age 7-8"), ('8-9', "Children's - Grade 3-4, Age 8-9"), ('9-11', "Children's - Grade 4-6, Age 9-11"), ('12-14', 'Teen - Grade 7-9, Age 12-14'), ('15-18', 'Teen - Grade 10-12, Age 15-18'), ('18-', 'Adult/Advanced Reader')], default='', max_length=5), + ), + migrations.AlterField( + model_name='work', + name='description', + field=models.TextField(blank=True, default='', null=True), + ), + migrations.AlterField( + model_name='work', + name='language', + field=models.CharField(db_index=True, default='en', max_length=5), + ), + migrations.AlterField( + model_name='workrelation', + name='relation', + field=models.CharField(choices=[('translation', 'translation'), ('revision', 'revision'), ('sequel', 'sequel'), 
('part', 'part')], max_length=15), + ), + ] diff --git a/core/migrations/0019_delete_key.py b/core/migrations/0019_delete_key.py new file mode 100644 index 000000000..1daa9f94c --- /dev/null +++ b/core/migrations/0019_delete_key.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.28 on 2020-02-17 15:08 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0018_auto_20200214_1347'), + ] + + operations = [ + migrations.DeleteModel( + name='Key', + ), + ] diff --git a/core/migrations/0020_auto_20200720_1319.py b/core/migrations/0020_auto_20200720_1319.py new file mode 100644 index 000000000..e9e82aaf1 --- /dev/null +++ b/core/migrations/0020_auto_20200720_1319.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2020-07-20 13:19 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0019_delete_key'), + ] + + operations = [ + migrations.RemoveField( + model_name='userprofile', + name='facebook_id', + ), + migrations.AlterField( + model_name='userprofile', + name='avatar_source', + field=models.PositiveSmallIntegerField(choices=[(0, 'No Avatar, Please'), (1, 'Gravatar'), (2, 'Twitter'), (4, 'Unglueitar')], default=4, null=True), + ), + ] diff --git a/core/migrations/0021_auto_20200806_1711.py b/core/migrations/0021_auto_20200806_1711.py new file mode 100644 index 000000000..4a2a086b1 --- /dev/null +++ b/core/migrations/0021_auto_20200806_1711.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2020-08-06 17:11 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0020_auto_20200720_1319'), + ] + + operations = [ + migrations.AlterField( + model_name='workrelation', + name='relation', + 
field=models.CharField(choices=[('translation', 'translation'), ('revision', 'revision'), ('sequel', 'sequel'), ('part', 'part'), ('unspecified', 'unspecified')], max_length=15), + ), + ] diff --git a/core/migrations/0022_auto_20200812_1247.py b/core/migrations/0022_auto_20200812_1247.py new file mode 100644 index 000000000..3a073fb25 --- /dev/null +++ b/core/migrations/0022_auto_20200812_1247.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2020-08-12 12:47 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0021_auto_20200806_1711'), + ] + + operations = [ + migrations.AlterField( + model_name='ebook', + name='format', + field=models.CharField(choices=[('pdf', 'PDF'), ('epub', 'EPUB'), ('html', 'HTML'), ('text', 'TEXT'), ('mobi', 'MOBI'), ('online', 'Online Only')], max_length=25), + ), + ] diff --git a/core/migrations/0023_auto_20201210_1508.py b/core/migrations/0023_auto_20201210_1508.py new file mode 100644 index 000000000..961943486 --- /dev/null +++ b/core/migrations/0023_auto_20201210_1508.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2020-12-10 15:08 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0022_auto_20200812_1247'), + ] + + operations = [ + migrations.RemoveField( + model_name='userprofile', + name='goodreads_auth_secret', + ), + migrations.RemoveField( + model_name='userprofile', + name='goodreads_auth_token', + ), + migrations.RemoveField( + model_name='userprofile', + name='goodreads_user_id', + ), + migrations.RemoveField( + model_name='userprofile', + name='goodreads_user_link', + ), + migrations.RemoveField( + model_name='userprofile', + name='goodreads_user_name', + ), + ] diff --git a/core/migrations/0024_auto_20210503_1717.py 
b/core/migrations/0024_auto_20210503_1717.py new file mode 100644 index 000000000..5c60aebfc --- /dev/null +++ b/core/migrations/0024_auto_20210503_1717.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2021-05-03 17:17 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0023_auto_20201210_1508'), + ] + + operations = [ + migrations.AlterField( + model_name='ebookfile', + name='source', + field=models.URLField(blank=True, max_length=1024, null=True), + ), + ] diff --git a/core/migrations/0025_remove_ebookfile_mobied.py b/core/migrations/0025_remove_ebookfile_mobied.py new file mode 100644 index 000000000..ea4384a78 --- /dev/null +++ b/core/migrations/0025_remove_ebookfile_mobied.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2022-07-28 06:16 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_auto_20210503_1717'), + ] + + operations = [ + migrations.RemoveField( + model_name='ebookfile', + name='mobied', + ), + ] diff --git a/core/migrations/0026_auto_20230105_2031.py b/core/migrations/0026_auto_20230105_2031.py new file mode 100644 index 000000000..5a98cb6d5 --- /dev/null +++ b/core/migrations/0026_auto_20230105_2031.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2023-01-05 20:31 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0025_remove_ebookfile_mobied'), + ] + + operations = [ + migrations.RemoveField( + model_name='userprofile', + name='twitter_id', + ), + migrations.AlterField( + model_name='userprofile', + name='avatar_source', + field=models.PositiveSmallIntegerField(choices=[(0, 'No Avatar, Please'), (1, 'Gravatar'), (4, 'Unglueitar')], 
default=4, null=True), + ), + ] diff --git a/core/migrations/0027_subject_num_free.py b/core/migrations/0027_subject_num_free.py new file mode 100644 index 000000000..4bf2acd10 --- /dev/null +++ b/core/migrations/0027_subject_num_free.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2024-08-19 14:39 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0026_auto_20230105_2031'), + ] + + operations = [ + migrations.AddField( + model_name='subject', + name='num_free', + field=models.IntegerField(default=0), + ), + ] diff --git a/core/migrations/0028_auto_20240819_1450.py b/core/migrations/0028_auto_20240819_1450.py new file mode 100644 index 000000000..6250c73da --- /dev/null +++ b/core/migrations/0028_auto_20240819_1450.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2024-08-19 14:50 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + def count_free(apps, schema_editor): + """ + Now that subject has a num_free field, populate it + """ + Subject = apps.get_model('core', 'Subject') + for subject in Subject.objects.all(): + subject.num_free = subject.works.filter(is_free=True).count() + subject.save() + + def noop(apps, schema_editor): + pass + + dependencies = [ + ('core', '0027_subject_num_free'), + ] + + operations = [ + migrations.RunPython(count_free, reverse_code=noop, hints={'core': 'Subject'}), + ] diff --git a/core/migrations/0029_auto_20241122_1525.py b/core/migrations/0029_auto_20241122_1525.py new file mode 100644 index 000000000..9ab5b13ea --- /dev/null +++ b/core/migrations/0029_auto_20241122_1525.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.29 on 2024-11-22 15:25 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + 
dependencies = [ + ('core', '0028_auto_20240819_1450'), + ] + + operations = [ + migrations.AlterField( + model_name='subject', + name='name', + field=models.CharField(db_index=True, max_length=200, unique=True), + ), + migrations.AlterField( + model_name='work', + name='is_free', + field=models.BooleanField(db_index=True, default=False), + ), + ] diff --git a/core/mobi.py b/core/mobi.py deleted file mode 100644 index b26d43e6b..000000000 --- a/core/mobi.py +++ /dev/null @@ -1,32 +0,0 @@ -import requests -from django.conf import settings - -mobigen_url = settings.MOBIGEN_URL -mobigen_user_id = settings.MOBIGEN_USER_ID -mobigen_password = settings.MOBIGEN_PASSWORD - - - -def convert_to_mobi(input_url, input_format="application/epub+zip"): - - """ - return a string with the output of mobigen computation - - """ - if mobigen_url and mobigen_user_id and mobigen_password: - print 'settings ok' - # using verify=False since at the moment, using a self-signed SSL cert. - - payload = requests.get(input_url).content - - headers = {'Content-Type': input_format} - r = requests.post(mobigen_url, auth=(mobigen_user_id, mobigen_password), - data=payload, headers=headers) - - # if HTTP reponse code is ok, the output is the mobi file; else error message - if r.status_code == 200: - return r.content - else: - print "{0}: {1}".format(r.status_code, r.content) - raise Exception("{0}: {1}".format(r.status_code, r.content)) - diff --git a/core/mobigen.py b/core/mobigen.py deleted file mode 100644 index e3a23fe42..000000000 --- a/core/mobigen.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Utilities for calling mobigen for management. 
do not use in application - -""" - -from itertools import islice -from StringIO import StringIO -import uuid - -from django.core.files.storage import default_storage -from django.core.files.base import ContentFile, File - -from regluit.core.models import (Campaign, Ebook) -from regluit.core import parameters -from regluit.core.mobi import convert_to_mobi - - -# compute whether we can apply mobigen to a given edition to produce a mobi file -# need to have an ebook in epub or pdf format -# possible return values: already has a mobi file / can generate a mobi file / not possible - -def edition_mobi_status(edition): - """ - for a given edition, return: - * 1 if there is already a mobi ebook - * 0 if there is none but we have an epub or html to convert from - * -1 for no epub/html to convert from - """ - formats = set([ebook.format for ebook in edition.work.ebooks()]) - if 'mobi' in formats: - return 1 - elif ('epub' in formats) or ('html' in formats): - return 0 - else: - return -1 - - -def write_file_to_storage(file_object, content_type, path): - """ - write file_object to the default_storage at given path - """ - file_s3 = ContentFile(file_object) - file_s3.content_type = content_type - - default_storage.save(path, file_s3) - return file_s3 - - -# generator for editions to add mobi to -# campaigns that can have mobi files but don't yet. 
- -def editions_to_convert(): - for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct(): - # need to make sure campaign type is not B2U because kindlegen is for books we give awy free of charge - if (edition_mobi_status(campaign.edition) == 0) and (campaign.type != parameters.BUY2UNGLUE): # possible to generate mobi - yield campaign.edition - - -def generate_mobi_ebook_for_edition(edition): - - # pull out the sister edition to convert from - sister_ebook = edition.ebooks.filter(format__in=['epub', 'html'])[0] - - # run the conversion process - - output = convert_to_mobi(sister_ebook.url) - #output = open("/Users/raymondyee/Downloads/hello.mobi").read() - - file_ = write_file_to_storage(output, - "application/x-mobipocket-ebook", - "/ebf/{0}.mobi".format(uuid.uuid4().get_hex())) - - # create a path for the ebookfile: IS THIS NECESSARY? - # https://github.com/Gluejar/regluit/blob/25dcb06f464dc11b5e589ab6859dfcc487f8f3ef/core/models.py#L1771 - - #ebfile = EbookFile(edition=edition, file=file_, format='mobi') - #ebfile.save() - - # maybe need to create an ebook pointing to ebookFile ? 
- # copy metadata from sister ebook - - ebfile_url = default_storage.url(file_.name) - #print (ebfile_url) - - ebook = Ebook(url=ebfile_url, - format="mobi", - provider="Unglue.it", - rights=sister_ebook.rights, - edition=edition) - ebook.save() - - return ebook diff --git a/core/models/__init__.py b/core/models/__init__.py index 866f0666e..ef9142fb5 100755 --- a/core/models/__init__.py +++ b/core/models/__init__.py @@ -4,11 +4,11 @@ import math import random import re -import urllib -import urllib2 +from urllib.parse import urlencode, quote_plus +from urllib.request import urlopen from datetime import timedelta, datetime from decimal import Decimal -from tempfile import SpooledTemporaryFile +from tempfile import TemporaryFile import requests from ckeditor.fields import RichTextField @@ -22,11 +22,11 @@ from django.contrib.auth.models import User from django.contrib.sites.models import Site from django.contrib.contenttypes.fields import GenericRelation -from django.core.urlresolvers import reverse from django.core.files.base import ContentFile -from django.db import models +from django.db import models, IntegrityError from django.db.models import F, Q from django.db.models.signals import post_save +from django.urls import reverse from django.utils.timezone import now from django.utils.translation import ugettext_lazy as _ @@ -47,8 +47,6 @@ TRANSACTION_STATUS_FAILED, TRANSACTION_STATUS_INCOMPLETE ) - -from regluit.utils import encryption as crypto from regluit.utils.localdatetime import date_today from regluit.core.parameters import ( @@ -63,10 +61,10 @@ THANKED, OFFER_CHOICES, ACQ_CHOICES, + GOOD_PROVIDERS, ) -from regluit.core.epub import personalize, ungluify, ask_epub +from regluit.core.epub import personalize, ask_epub from regluit.core.pdf import ask_pdf, pdf_append -from regluit.core import mobi from regluit.core.signals import ( successful_campaign, unsuccessful_campaign, @@ -81,7 +79,6 @@ EbookFile, Edition, EditionNote, - good_providers, Identifier, 
path_for_file, Publisher, @@ -104,32 +101,16 @@ class UnglueitError(RuntimeError): pass -class Key(models.Model): - """an encrypted key store""" - name = models.CharField(max_length=255, unique=True) - encrypted_value = models.TextField(null=True, blank=True) - - def _get_value(self): - return crypto.decrypt_string(binascii.a2b_hex(self.encrypted_value), settings.SECRET_KEY) - - def _set_value(self, value): - self.encrypted_value = binascii.b2a_hex(crypto.encrypt_string(value, settings.SECRET_KEY)) - - value = property(_get_value, _set_value) - - def __unicode__(self): - return "Key with name {0}".format(self.name) - class CeleryTask(models.Model): created = models.DateTimeField(auto_now_add=True) task_id = models.CharField(max_length=255) - user = models.ForeignKey(settings.AUTH_USER_MODEL, related_name="tasks", null=True) + user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tasks", null=True) description = models.CharField(max_length=2048, null=True) # a description of what the task is function_name = models.CharField(max_length=1024) # used to reconstitute the AsyncTask with which to get status function_args = models.IntegerField(null=True) # not full generalized here -- takes only a single arg for now. 
active = models.NullBooleanField(default=True) - def __unicode__(self): + def __str__(self): return "Task %s arg:%s ID# %s %s: State %s " % (self.function_name, self.function_args, self.task_id, self.description, self.state) @property @@ -158,7 +139,7 @@ class Premium(models.Model): TIERS = {"supporter":25, "patron":50, "bibliophile":100} #should load this from fixture created = models.DateTimeField(auto_now_add=True) type = models.CharField(max_length=2, choices=PREMIUM_TYPES) - campaign = models.ForeignKey("Campaign", related_name="premiums", null=True) + campaign = models.ForeignKey("Campaign", on_delete=models.CASCADE, related_name="premiums", null=True) amount = models.DecimalField(max_digits=10, decimal_places=0, blank=False) description = models.TextField(null=True, blank=False) limit = models.IntegerField(default=0) @@ -171,7 +152,7 @@ def premium_count(self): def premium_remaining(self): t_model = apps.get_model('payment', 'Transaction') return self.limit - t_model.objects.filter(premium=self).count() - def __unicode__(self): + def __str__(self): return (self.campaign.work.title if self.campaign else '') + ' $' + str(self.amount) class PledgeExtra: @@ -190,10 +171,10 @@ class CampaignAction(models.Model): # anticipated types: activated, withdrawn, suspended, restarted, succeeded, failed, unglued type = models.CharField(max_length=15) comment = models.TextField(null=True, blank=True) - campaign = models.ForeignKey("Campaign", related_name="actions", null=False) + campaign = models.ForeignKey("Campaign", on_delete=models.CASCADE, related_name="actions", null=False) class Offer(models.Model): - work = models.ForeignKey("Work", related_name="offers", null=False) + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name="offers", null=False) price = models.DecimalField(max_digits=6, decimal_places=2, null=True, blank=False) license = models.PositiveSmallIntegerField(null=False, default=INDIVIDUAL, choices=OFFER_CHOICES) @@ -219,26 +200,26 @@ 
class Acq(models.Model): expires = models.DateTimeField(null=True) refreshes = models.DateTimeField(auto_now_add=True) refreshed = models.BooleanField(default=True) - work = models.ForeignKey("Work", related_name='acqs', null=False) - user = models.ForeignKey(settings.AUTH_USER_MODEL, related_name='acqs') + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name='acqs', null=False) + user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name='acqs') license = models.PositiveSmallIntegerField(null=False, default=INDIVIDUAL, choices=ACQ_CHOICES) - watermarked = models.ForeignKey("booxtream.Boox", null=True) + watermarked = models.ForeignKey("booxtream.Boox", on_delete=models.CASCADE, null=True) nonce = models.CharField(max_length=32, null=True) # when the acq is a loan, this points at the library's acq it's derived from - lib_acq = models.ForeignKey("self", related_name="loans", null=True) + lib_acq = models.ForeignKey("self", on_delete=models.CASCADE, related_name="loans", null=True) class mock_ebook(object): def __init__(self, acq): - self.url = acq.get_mobi_url() - self.format = 'mobi' + self.url = acq.get_epub_url() + self.format = 'epub' self.filesize = 0 def save(self): return True def get_archive(self): try: - r = urllib2.urlopen(self.url) + r = urlopen(self.url) try: self.filesize = int(r.info().getheaders("Content-Length")[0]) except IndexError: @@ -252,7 +233,7 @@ def get_archive(self): def ebook(self): return self.mock_ebook(self) - def __unicode__(self): + def __str__(self): if self.lib_acq: return "%s, %s: %s for %s" % (self.work.title, self.get_license_display(), self.lib_acq.user, self.user) else: @@ -265,10 +246,6 @@ def expired(self): else: return self.expires < datetime.now() - def get_mobi_url(self): - if self.expired: - return '' - return self.get_watermarked().download_link_mobi def get_epub_url(self): if self.expired: @@ -300,7 +277,7 @@ def get_watermarked(self): return self.watermarked def 
_hash(self): - return hashlib.md5('%s:%s:%s:%s'%(settings.SOCIAL_AUTH_TWITTER_SECRET, self.user_id, self.work_id, self.created)).hexdigest() + return hashlib.md5(bytes('%s:%s:%s'%(self.user_id, self.work_id, self.created), 'utf-8')).hexdigest() def expire_in(self, delta): self.expires = (now() + delta) if delta else now() @@ -326,7 +303,7 @@ def borrow(self, user=None): borrowed = Acq.objects.create(user=user, work=self.work, license=BORROWED, lib_acq=self) from regluit.core.tasks import watermark_acq notification.send([user], "library_borrow", {'acq':borrowed}) - watermark_acq.delay(borrowed) + watermark_acq.delay(borrowed.id) result = borrowed from regluit.core.tasks import emit_notifications emit_notifications.delay() @@ -359,11 +336,11 @@ def config_acq(sender, instance, created, **kwargs): class Hold(models.Model): created = models.DateTimeField(auto_now_add=True) - work = models.ForeignKey("Work", related_name='holds', null=False) - user = models.ForeignKey(settings.AUTH_USER_MODEL, related_name='holds', null=False) - library = models.ForeignKey(Library, related_name='holds', null=False) + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name='holds', null=False) + user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name='holds', null=False) + library = models.ForeignKey(Library, on_delete=models.CASCADE, related_name='holds', null=False) - def __unicode__(self): + def __str__(self): return '%s for %s at %s' % (self.work, self.user.username, self.library) def ahead(self): return Hold.objects.filter(work=self.work, library=self.library, created__lt=self.created).count() @@ -391,8 +368,8 @@ class Campaign(models.Model): activated = models.DateTimeField(null=True, db_index=True,) paypal_receiver = models.CharField(max_length=100, blank=True) amazon_receiver = models.CharField(max_length=100, blank=True) - work = models.ForeignKey("Work", related_name="campaigns", null=False) - managers = 
models.ManyToManyField(settings.AUTH_USER_MODEL, related_name="campaigns", null=False) + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name="campaigns", null=False) + managers = models.ManyToManyField(settings.AUTH_USER_MODEL, related_name="campaigns") # status: INITIALIZED, ACTIVE, SUSPENDED, WITHDRAWN, SUCCESSFUL, UNSUCCESSFUL status = models.CharField( max_length=15, null=True, blank=False, default="INITIALIZED", db_index=True, choices=STATUS_CHOICES) @@ -401,9 +378,9 @@ class Campaign(models.Model): (BUY2UNGLUE, 'Buy-to-unglue campaign'), (THANKS, 'Thanks-for-ungluing campaign'), )) - edition = models.ForeignKey("Edition", related_name="campaigns", null=True) + edition = models.ForeignKey("Edition", on_delete=models.CASCADE, related_name="campaigns", null=True) email = models.CharField(max_length=100, blank=True) - publisher = models.ForeignKey("Publisher", related_name="campaigns", null=True) + publisher = models.ForeignKey("Publisher", on_delete=models.CASCADE, related_name="campaigns", null=True) do_watermark = models.BooleanField(default=True) use_add_ask = models.BooleanField(default=True) charitable = models.BooleanField(default=False) @@ -412,7 +389,7 @@ def __init__(self, *args, **kwargs): self.problems = [] super(Campaign, self).__init__(*args, **kwargs) - def __unicode__(self): + def __str__(self): try: return u"Campaign for %s" % self.work.title except: @@ -443,7 +420,7 @@ def clone(self): self.activated = None self.update_left() self.save() - self.managers = old_managers + self.managers.set(old_managers) # clone associated premiums for premium in new_premiums: @@ -492,10 +469,10 @@ def launchable(self): self.problems.append(_('A campaign must have a target')) may_launch = False if self.type == BUY2UNGLUE: - if self.work.offers.filter(price__gt=0, active=True).count() == 0: + if not self.work.offers.filter(price__gt=0, active=True).exists(): self.problems.append(_('You can\'t launch a buy-to-unglue campaign before setting a price 
for your ebooks')) may_launch = False - if EbookFile.objects.filter(edition__work=self.work).count() == 0: + if not EbookFile.objects.filter(edition__work=self.work).exists(): self.problems.append(_('You can\'t launch a buy-to-unglue campaign if you don\'t have any ebook files uploaded')) may_launch = False if (self.cc_date_initial is None) or (self.cc_date_initial > datetime.combine(settings.MAX_CC_DATE, datetime.min.time())) or (self.cc_date_initial < now()): @@ -510,7 +487,7 @@ def launchable(self): may_launch = False if self.type == THANKS: # the case in which there is no EbookFile and no Ebook associated with work (We have ebooks without ebook files.) - if EbookFile.objects.filter(edition__work=self.work).count() == 0 and self.work.ebooks().count() == 0: + if not EbookFile.objects.filter(edition__work=self.work).exists() and not self.work.ebooks().exists(): self.problems.append(_('You can\'t launch a thanks-for-ungluing campaign if you don\'t have any ebook files uploaded')) may_launch = False except Exception as e: @@ -568,7 +545,7 @@ def update_status(self, ignore_deadline_for_success=False, send_notice=False, pr self.save() action = CampaignAction(campaign=self, type='succeeded', comment=self.current_total) action.save() - self.watermark_success() + # B2U watermark_success() removed (#1093); entire B2U branch is dead code (#1081) if send_notice: successful_campaign.send(sender=None, campaign=self) @@ -640,10 +617,10 @@ def activate(self): raise UnglueitError(_('Campaign needs to be initialized in order to be activated')) try: active_claim = self.work.claim.filter(status="active")[0] - except IndexError, e: + except IndexError as e: raise UnglueitError(_('Campaign needs to have an active claim in order to be activated')) if not self.launchable: - raise UnglueitError('Configuration issues need to be addressed before campaign is activated: %s' % unicode(self.problems[0])) + raise UnglueitError('Configuration issues need to be addressed before campaign is 
activated: %s' % str(self.problems[0])) self.status = 'ACTIVE' self.left = self.target self.activated = datetime.today() @@ -721,16 +698,13 @@ def transaction_to_recharge(self, user): # only if a campaign is SUCCESSFUL, we allow for recharged if self.status == 'SUCCESSFUL': - if self.transaction_set.filter(Q(user=user) & (Q(status=TRANSACTION_STATUS_COMPLETE) | Q(status=TRANSACTION_STATUS_ACTIVE))).count(): + if self.transaction_set.filter(Q(user=user) & (Q(status=TRANSACTION_STATUS_COMPLETE) | Q(status=TRANSACTION_STATUS_ACTIVE))).exists(): # presence of an active or complete transaction means no transaction to recharge return None else: transactions = self.transaction_set.filter(Q(user=user) & (Q(status=TRANSACTION_STATUS_ERROR) | Q(status=TRANSACTION_STATUS_FAILED))) # assumption --that the first failed/errored transaction has the amount we need to recharge - if transactions.count(): - return transactions[0] - else: - return None + return transactions.first() else: return None @@ -896,9 +870,9 @@ def countdown(self): if time_remaining.days: countdown = "%s days" % str(time_remaining.days + 1) elif time_remaining.seconds > 3600: - countdown = "%s hours" % str(time_remaining.seconds/3600 + 1) + countdown = "%s hours" % str(time_remaining.seconds // 3600 + 1) elif time_remaining.seconds > 60: - countdown = "%s minutes" % str(time_remaining.seconds/60 + 1) + countdown = "%s minutes" % str(time_remaining.seconds // 60 + 1) else: countdown = "Seconds" @@ -908,17 +882,6 @@ def countdown(self): def latest_ending(cls): return timedelta(days=int(settings.UNGLUEIT_LONGEST_DEADLINE)) + now() - def make_mobis(self): - # make archive files for ebooks, make mobi files for epubs - versions = set() - for ebook in self.work.ebooks().filter(provider__in=good_providers, format='mobi'): - versions.add(ebook.version_label) - for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers, format='epub'): - if not ebook.version_label in versions: - 
# now make the mobi file - ebf = ebook.get_archive_ebf() - ebf.make_mobi() - def add_ask_to_ebfs(self, position=0): if not self.use_add_ask or self.type != THANKS: return @@ -930,7 +893,7 @@ def add_ask_to_ebfs(self, position=0): ebf.file.open() to_dos.append({'content': ebf.file.read(), 'ebook': ebf.ebook}) format_versions.append(format_version) - for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers): + for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS): format_version = '{}_{}'.format(ebook.format, ebook.version_label) if ebook.format in ('pdf', 'epub') and not format_version in format_versions: to_dos.append({'content': ebook.get_archive().read(), 'ebook': ebook}) @@ -942,8 +905,8 @@ def add_ask_to_ebfs(self, position=0): if to_do['ebook'].format == 'pdf': try: added = ask_pdf({'campaign':self, 'work':self.work, 'site':Site.objects.get_current()}) - new_file = SpooledTemporaryFile() - old_file = SpooledTemporaryFile() + new_file = TemporaryFile() + old_file = TemporaryFile() old_file.write(to_do['content']) if position == 0: pdf_append(added, old_file, new_file) @@ -959,7 +922,7 @@ def add_ask_to_ebfs(self, position=0): logger.error("error appending pdf ask %s" % (e)) elif to_do['ebook'].format == 'epub': try: - old_file = SpooledTemporaryFile() + old_file = TemporaryFile() old_file.write(to_do['content']) new_file = ask_epub(old_file, {'campaign':self, 'work':self.work, 'site':Site.objects.get_current()}) new_file.seek(0) @@ -969,17 +932,6 @@ def add_ask_to_ebfs(self, position=0): new_epub_ebf.version = version new_ebfs.append(new_epub_ebf) - # now make the mobi file - new_mobi_ebf = EbookFile.objects.create(edition=edition, format='mobi', asking=True) - try: - new_mobi_file = ContentFile(mobi.convert_to_mobi(new_epub_ebf.file.url)) - except Exception as e: - logger.error("error making mobi for %s" % (new_epub_ebf.file.url)) - raise e - 
new_mobi_ebf.file.save(path_for_file('ebf', None), new_mobi_file) - new_mobi_ebf.save() - new_mobi_ebf.version = version - new_ebfs.append(new_mobi_ebf) except Exception as e: logger.error("error making epub ask or mobi %s" % (e)) for ebf in new_ebfs: @@ -1023,54 +975,6 @@ def revert_asks(self): ebf.ebook.activate() format_versions.append(format_version) - def make_unglued_ebf(self, format, watermarked): - r = urllib2.urlopen(watermarked.download_link(format)) - ebf = EbookFile.objects.create(edition=self.work.preferred_edition, format=format) - ebf.file.save(path_for_file(ebf, None), ContentFile(r.read())) - ebf.file.close() - ebf.save() - ebook = Ebook.objects.create( - edition=self.work.preferred_edition, - format=format, - rights=self.license, - provider="Unglue.it", - url=settings.BASE_URL_SECURE + reverse('download_campaign', args=[self.work_id, format]), - version='unglued', - ) - old_ebooks = Ebook.objects.exclude(pk=ebook.pk).filter( - edition=self.work.preferred_edition, - format=format, - rights=self.license, - provider="Unglue.it", - ) - for old_ebook in old_ebooks: - old_ebook.deactivate() - return ebook.pk - - - def watermark_success(self): - if self.status == 'SUCCESSFUL' and self.type == BUY2UNGLUE: - params = { - 'customeremailaddress': self.license, - 'customername': 'The Public', - 'languagecode':'1033', - 'expirydays': 1, - 'downloadlimit': 7, - 'exlibris':0, - 'chapterfooter':0, - 'disclaimer':0, - 'referenceid': '%s:%s:%s' % (self.work_id, self.id, self.license), - 'kf8mobi': True, - 'epub': True, - } - ungluified = ungluify(self.work.epubfiles()[0].file, self) - ungluified.filename.seek(0) - watermarked = watermarker.platform(epubfile=ungluified.filename, **params) - self.make_unglued_ebf('epub', watermarked) - self.make_unglued_ebf('mobi', watermarked) - return True - return False - def is_pledge(self): return self.type == REWARDS @@ -1085,17 +989,21 @@ def marc_records(self): class Wishlist(models.Model): created = 
models.DateTimeField(auto_now_add=True) - user = models.OneToOneField(settings.AUTH_USER_MODEL, related_name='wishlist') + user = models.OneToOneField(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name='wishlist') works = models.ManyToManyField('Work', related_name='wishlists', through='Wishes') - def __unicode__(self): + def __str__(self): return "%s's Books" % self.user.username def add_work(self, work, source, notify=False): try: w = Wishes.objects.get(wishlist=self, work=work) - except: - Wishes.objects.create(source=source, wishlist=self, work=work) + except Wishes.DoesNotExist: + try: + Wishes.objects.create(source=source, wishlist=self, work=work) + except IntegrityError: + # threading issue? + return work.update_num_wishes() # only send notification in case of new wishes # and only when they result from user action, not (e.g.) our tests @@ -1118,8 +1026,8 @@ def work_source(self, work): class Wishes(models.Model): created = models.DateTimeField(auto_now_add=True, db_index=True,) source = models.CharField(max_length=15, blank=True, db_index=True,) - wishlist = models.ForeignKey('Wishlist') - work = models.ForeignKey('Work', related_name='wishes') + wishlist = models.ForeignKey('Wishlist', on_delete=models.CASCADE) + work = models.ForeignKey('Work', on_delete=models.CASCADE, related_name='wishes') class Meta: db_table = 'core_wishlist_works' @@ -1130,7 +1038,7 @@ class Badge(models.Model): @property def path(self): return '/static/images/%s.png' % self.name - def __unicode__(self): + def __str__(self): return self.name def pledger(): @@ -1146,10 +1054,10 @@ def pledger2(): pledger2.instance = None ANONYMOUS_AVATAR = '/static/images/header/avatar.png' -(NO_AVATAR, GRAVATAR, TWITTER, FACEBOOK, UNGLUEITAR) = AVATARS +(NO_AVATAR, GRAVATAR, TWITTER, UNGLUEITAR) = AVATARS class Libpref(models.Model): - user = models.OneToOneField(settings.AUTH_USER_MODEL, related_name='libpref') + user = models.OneToOneField(settings.AUTH_USER_MODEL, 
on_delete=models.CASCADE, related_name='libpref') marc_link_target = models.CharField( max_length=6, default='UNGLUE', @@ -1159,12 +1067,10 @@ class Libpref(models.Model): class UserProfile(models.Model): created = models.DateTimeField(auto_now_add=True) - user = models.OneToOneField(settings.AUTH_USER_MODEL, related_name='profile') + user = models.OneToOneField(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name='profile') tagline = models.CharField(max_length=140, blank=True) pic_url = models.URLField(blank=True) home_url = models.URLField(blank=True) - twitter_id = models.CharField(max_length=15, blank=True) - facebook_id = models.BigIntegerField(null=True, blank=True) librarything_id = models.CharField(max_length=31, blank=True) badges = models.ManyToManyField('Badge', related_name='holders', blank=True) kindle_email = models.EmailField(max_length=254, blank=True) @@ -1172,25 +1078,17 @@ class UserProfile(models.Model): # keep track of work the user adds works = models.ManyToManyField('Work', related_name='contributors', blank=True) - goodreads_user_id = models.CharField(max_length=32, null=True, blank=True) - goodreads_user_name = models.CharField(max_length=200, null=True, blank=True) - goodreads_auth_token = models.TextField(null=True, blank=True) - goodreads_auth_secret = models.TextField(null=True, blank=True) - goodreads_user_link = models.CharField(max_length=200, null=True, blank=True) - avatar_source = models.PositiveSmallIntegerField( null=True, default=UNGLUEITAR, choices=( (NO_AVATAR, 'No Avatar, Please'), (GRAVATAR, 'Gravatar'), - (TWITTER, 'Twitter'), - (FACEBOOK, 'Facebook'), (UNGLUEITAR, 'Unglueitar'), ) ) - def __unicode__(self): + def __str__(self): return self.user.username def reset_pledge_badge(self): @@ -1206,24 +1104,19 @@ def reset_pledge_badge(self): @property def pledge_count(self): - return self.user.transaction_set.exclude(status='NONE').exclude(status='Canceled', reason=None).exclude(anonymous=True).count() + return 
self.user.transaction_set.exclude(status='NONE').exclude( + status='Canceled', reason=None).exclude(anonymous=True).count() @property def account(self): # there should be only one active account per user - accounts = self.user.account_set.filter(date_deactivated__isnull=True) - if accounts.count() == 0: - return None - else: - return accounts[0] + return self.user.account_set.filter(date_deactivated__isnull=True).first() @property def old_account(self): - accounts = self.user.account_set.filter(date_deactivated__isnull=False).order_by('-date_deactivated') - if accounts.count() == 0: - return None - else: - return accounts[0] + return self.user.account_set.filter( + date_deactivated__isnull=False + ).order_by('-date_deactivated').first() @property def pledges(self): @@ -1232,10 +1125,7 @@ def pledges(self): @property def last_transaction(self): from regluit.payment.models import Transaction - try: - return Transaction.objects.filter(user=self.user).order_by('-date_modified')[0] - except IndexError: - return None + return Transaction.objects.filter(user=self.user).order_by('-date_modified').first() @property def ack_name(self): @@ -1252,8 +1142,7 @@ def anon_pref(self): last = self.last_transaction if last: return last.anonymous - else: - return None + return None @property def on_ml(self): @@ -1267,12 +1156,12 @@ def on_ml(self): ) if member['status'] == 'subscribed': return 'True' - except MailChimpError, e: - if e[0]['status'] != 404: # don't log case where user is not on a list + except MailChimpError as e: + if e.args[0]['status'] != 404: # don't log case where user is not on a list logger.error("error getting mailchimp status %s" % (e)) - except ValueError, e: + except ValueError as e: logger.error("bad email address %s" % (self.user.email)) - except Exception, e: + except Exception as e: logger.error("error getting mailchimp status %s" % (e)) return False @@ -1281,7 +1170,7 @@ def ml_subscribe(self, **kwargs): # use @example.org email addresses for testing! 
return from regluit.core.tasks import ml_subscribe_task - ml_subscribe_task.delay(self, **kwargs) + ml_subscribe_task.delay(self.id, **kwargs) def ml_unsubscribe(self): if "@example.org" in self.user.email: @@ -1293,29 +1182,29 @@ def ml_unsubscribe(self): subscriber_hash=self.user.email, ) return True - except MailChimpError, e: - if e[0]['status'] != 404: # don't log case where user is not on a list + except MailChimpError as e: + if e.args[0]['status'] != 404: # don't log case where user is not on a list logger.error("error getting mailchimp status %s" % (e)) - except Exception, e: + except Exception as e: logger.error("error unsubscribing from mailchimp list %s" % (e)) return False def gravatar(self): # construct the url - gravatar_url = "https://www.gravatar.com/avatar/" + hashlib.md5(self.user.email.lower()).hexdigest() + "?" - gravatar_url += urllib.urlencode({'d':'wavatar', 's':'50'}) + gravatar_url = "https://www.gravatar.com/avatar/" + hashlib.md5(bytes(self.user.email.lower(), 'utf-8')).hexdigest() + "?" + gravatar_url += urlencode({'d':'wavatar', 's':'50'}) return gravatar_url def unglueitar(self): # construct the url - gravatar_url = "https://www.gravatar.com/avatar/" + hashlib.md5(urllib.quote_plus(self.user.username.encode('utf-8')) + '@unglue.it').hexdigest() + "?" - gravatar_url += urllib.urlencode({'d':'wavatar', 's':'50'}) + gravatar_url = "https://www.gravatar.com/avatar/" + hashlib.md5(bytes(quote_plus(self.user.username), 'utf-8') + b'@unglue.it').hexdigest() + "?" 
+ gravatar_url += urlencode({'d':'wavatar', 's':'50'}) return gravatar_url @property def avatar_url(self): - if self.avatar_source is None or self.avatar_source is TWITTER: + if self.avatar_source is None: if self.pic_url: return self.pic_url else: @@ -1324,10 +1213,7 @@ def avatar_url(self): return self.unglueitar() elif self.avatar_source == GRAVATAR: return self.gravatar() - elif self.avatar_source == FACEBOOK and self.facebook_id != None: - return 'https://graph.facebook.com/v2.3/' + str(self.facebook_id) + '/picture?redirect=true' - else: - return ANONYMOUS_AVATAR + return ANONYMOUS_AVATAR @property def social_auths(self): @@ -1359,9 +1245,9 @@ class Press(models.Model): class Gift(models.Model): # the acq will contain the recipient, and the work - acq = models.ForeignKey('Acq', related_name='gifts') + acq = models.ForeignKey('Acq', on_delete=models.CASCADE, related_name='gifts') to = models.CharField(max_length=75, blank=True) # store the email address originally sent to, not necessarily the email of the recipient - giver = models.ForeignKey(User, related_name='gifts') + giver = models.ForeignKey(User, on_delete=models.CASCADE, related_name='gifts') message = models.TextField(max_length=512, default='') used = models.DateTimeField(null=True) diff --git a/core/models/bibmodels.py b/core/models/bibmodels.py index f8b0af23b..954a472d3 100644 --- a/core/models/bibmodels.py +++ b/core/models/bibmodels.py @@ -1,39 +1,42 @@ import logging import math import re -import urllib -import urllib2 +import unicodedata import uuid from decimal import Decimal -import unicodedata -from urlparse import urlparse +from ssl import CertificateError +from urllib.parse import urlparse + +import requests -from sorl.thumbnail import get_thumbnail +from botocore.exceptions import ClientError from PIL import ImageFile + from django.conf import settings from django.contrib.auth.models import User from django.contrib.contenttypes.fields import GenericRelation from django.core.files.base 
import ContentFile -from django.core.urlresolvers import reverse +from django.urls import reverse from django.db import models from django.db.models import F -from django.db.models.signals import post_save, pre_delete +from django.db.models.signals import m2m_changed, post_save, pre_delete from django.utils.timezone import now from django_comments.models import Comment import regluit from regluit.marc.models import MARCRecord as NewMARC -from questionnaire.models import Landing -from regluit.core import mobi +from regluit.bisac.models import interpret_notation import regluit.core.cc as cc +from regluit.core.covers import (get_thumbnail, + DEFAULT_COVER, DEFAULT_COVER_LARGE, DEFAULT_COVER_SMALL) from regluit.core.epub import test_epub from regluit.core.links import id_url +from regluit.core.loaders.harvest import dl_online from regluit.core.validation import valid_subject - from regluit.core.parameters import ( AGE_LEVEL_CHOICES, BORROWED, @@ -47,13 +50,14 @@ THANKED, THANKS, WORK_IDENTIFIERS, + DOMAIN_TO_PROVIDER, ) # fix truncated file problems per https://stackoverflow.com/questions/12984426/python-pil-ioerror-image-file-truncated-with-big-images ImageFile.LOAD_TRUNCATED_IMAGES = True logger = logging.getLogger(__name__) -good_providers = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library') +dllogger = logging.getLogger('regluit.downloads') def id_for(obj, type): if not obj.pk: @@ -67,8 +71,8 @@ class Identifier(models.Model): # olib, ltwk, goog, gdrd, thng, isbn, oclc, olwk, doab, gtbg, glue, doi type = models.CharField(max_length=4, null=False) value = models.CharField(max_length=250, null=False) - work = models.ForeignKey("Work", related_name="identifiers", null=False) - edition = models.ForeignKey("Edition", related_name="identifiers", null=True) + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name="identifiers", null=False) + edition = models.ForeignKey("Edition", on_delete=models.CASCADE, related_name="identifiers", null=True) 
class Meta: unique_together = ("type", "value") @@ -92,7 +96,7 @@ def set(type=None, value=None, edition=None, work=None): identifier.edition = edition identifier.save() others = Identifier.objects.filter(type=type, work=work, edition=edition).exclude(value=value) - if others.count() > 0: + if others.exists(): for other in others: other.delete() return identifier @@ -106,12 +110,12 @@ def get_or_add(type='goog', value=None, edition=None, work=None): i.save() return i - def __unicode__(self): + def __str__(self): return u'{0}:{1}'.format(self.type, self.value) - + def label(self): return ID_CHOICES_MAP.get(self.type, self.type) - + def url(self): return id_url(self.type, self.value) @@ -122,19 +126,21 @@ class Work(models.Model): openlibrary_lookup = models.DateTimeField(null=True, blank=True) num_wishes = models.IntegerField(default=0, db_index=True) description = models.TextField(default='', null=True, blank=True) - selected_edition = models.ForeignKey("Edition", related_name='selected_works', null=True) + selected_edition = models.ForeignKey("Edition", on_delete=models.CASCADE, related_name='selected_works', null=True) # repurposed earliest_publication to actually be publication range publication_range = models.CharField(max_length=50, null=True, blank=True) featured = models.DateTimeField(null=True, blank=True, db_index=True,) - is_free = models.BooleanField(default=False) - landings = GenericRelation(Landing, related_query_name='works') + is_free = models.BooleanField(default=False, db_index=True) related = models.ManyToManyField('self', symmetrical=False, blank=True, through='WorkRelation', related_name='reverse_related') - age_level = models.CharField(max_length=5, choices=AGE_LEVEL_CHOICES, default='', blank=True) + age_level = models.CharField(max_length=5, choices=AGE_LEVEL_CHOICES, default='', blank=True) class Meta: ordering = ['title'] - - def __unicode__(self): + indexes = [ + models.Index(fields=['is_free', 'title']), + ] + + def __str__(self): return 
self.title def __init__(self, *args, **kwargs): @@ -144,7 +150,7 @@ def __init__(self, *args, **kwargs): def delete(self, cascade=True, *args, **kwargs): if cascade: if self.offers.all() or self.claim.all() or self.campaigns.all() or self.acqs.all() \ - or self.holds.all() or self.landings.all(): + or self.holds.all(): return for wishlist in self.wishlists.all(): wishlist.remove_work(self) @@ -165,9 +171,9 @@ def delete(self, cascade=True, *args, **kwargs): for work_relation in self.works_related_from.all(): work_relation.delete() super(Work, self).delete(*args, **kwargs) # Call the "real" save() method. - + def id_for(self, type): - return id_for(self, type) + return id_for(self, type) # this is NOT recursive! @property def gtbg(self): @@ -233,7 +239,7 @@ def openlibrary_id(self): @property def openlibrary_url(self): return id_url('olwk', self.openlibrary_id) - + def cover_filetype(self): if self.uses_google_cover(): return 'jpeg' @@ -262,12 +268,12 @@ def uses_google_cover(self): def cover_image_large(self): if self.preferred_edition and self.preferred_edition.has_cover_image(): return self.preferred_edition.cover_image_large() - return "/static/images/generic_cover_larger.png" + return DEFAULT_COVER_LARGE def cover_image_small(self): if self.preferred_edition and self.preferred_edition.has_cover_image(): return self.preferred_edition.cover_image_small() - return "/static/images/generic_cover_larger.png" + return DEFAULT_COVER_SMALL def cover_image_thumbnail(self): try: @@ -275,29 +281,29 @@ def cover_image_thumbnail(self): return self.preferred_edition.cover_image_thumbnail() except IndexError: pass - return "/static/images/generic_cover_larger.png" + return DEFAULT_COVER def authors(self): # assumes that they come out in the same order they go in! 
- if self.preferred_edition and self.preferred_edition.authors.all().count() > 0: + if self.preferred_edition and self.preferred_edition.authors.exists(): return self.preferred_edition.authors.all() for edition in self.editions.all(): - if edition.authors.all().count() > 0: + if edition.authors.exists(): return edition.authors.all() return Author.objects.none() def relators(self): # assumes that they come out in the same order they go in! - if self.preferred_edition and self.preferred_edition.relators.all().count() > 0: + if self.preferred_edition and self.preferred_edition.relators.exists(): return self.preferred_edition.relators.all() for edition in self.editions.all(): - if edition.relators.all().count() > 0: + if edition.relators.exists(): return edition.relators.all() return Relator.objects.none() def author(self): # assumes that they come out in the same order they go in! - if self.relators().count() > 0: + if self.relators().exists(): return self.relators()[0].name return '' @@ -329,7 +335,7 @@ def kindle_safe_title(self): nkfd_form = unicodedata.normalize('NFKD', self.title) #unaccent accented letters for c in nkfd_form: ccat = unicodedata.category(c) - #print ccat + if ccat.startswith('L') or ccat.startswith('N'): # only letters and numbers if ord(c) > 127: safe = safe + '#' #a non latin script letter or number @@ -399,9 +405,9 @@ def percent_unglued(self): status = 6 else: if campaign.type == BUY2UNGLUE: - status = int(6 - 6*campaign.left/campaign.target) + status = int(6 - 6 * campaign.left / campaign.target) else: - status = int(float(campaign.current_total)*6/target) + status = int(float(campaign.current_total) * 6 / target) if status >= 6: status = 6 return status @@ -431,14 +437,14 @@ def mobifiles(self): def pdffiles(self): return EbookFile.objects.filter(edition__work=self, format='pdf').exclude(file='').order_by('-created') - + def versions(self): version_labels = [] for ebook in self.ebooks_all(): if ebook.version_label and not 
ebook.version_label in version_labels: version_labels.append(ebook.version_label) return version_labels - + def formats(self): fmts = [] for fmt in ['pdf', 'epub', 'mobi', 'html']: @@ -450,7 +456,7 @@ def formats(self): def remove_old_ebooks(self): # this method is triggered after an file upload or new ebook saved old = Ebook.objects.filter(edition__work=self, active=True).order_by('-version_iter', '-created') - + # keep highest version ebook for each format and version label done_format_versions = [] for eb in old: @@ -459,7 +465,7 @@ def remove_old_ebooks(self): eb.deactivate() else: done_format_versions.append(format_version) - + # check for failed uploads. null_files = EbookFile.objects.filter(edition__work=self, file='') for ebf in null_files: @@ -512,11 +518,8 @@ def update_num_wishes(self): self.save() def priority(self): - if self.last_campaign(): - return 5 - freedom = 1 if self.is_free else 0 wishing = int(math.log(self.num_wishes)) + 1 if self.num_wishes else 0 - return min(freedom + wishing, 5) + return min(1 + wishing, 5) def first_oclc(self): if self.preferred_edition is None: @@ -612,7 +615,7 @@ def get_lib_license(self, user): return self.get_user_license(lib_user) def borrowable(self, user): - if user.is_anonymous(): + if user.is_anonymous: return False lib_license = self.get_lib_license(user) if lib_license and lib_license.borrowable: @@ -620,7 +623,7 @@ def borrowable(self, user): return False def lib_thanked(self, user): - if user.is_anonymous(): + if user.is_anonymous: return False lib_license = self.get_lib_license(user) if lib_license and lib_license.thanked: @@ -628,10 +631,10 @@ def lib_thanked(self, user): return False def in_library(self, user): - if user.is_anonymous(): + if user.is_anonymous: return False lib_license = self.get_lib_license(user) - if lib_license and lib_license.acqs.count(): + if lib_license and lib_license.acqs.exists(): return True return False @@ -650,23 +653,15 @@ def __init__(self, acqs): @property def 
is_active(self): - return self.acqs.filter(expires__isnull=True).count() > 0 or self.acqs.filter(expires__gt=now()).count() > 0 + return self.acqs.filter(expires__isnull=True).exists() or self.acqs.filter(expires__gt=now()).exists() @property def borrowed(self): - loans = self.acqs.filter(license=BORROWED, expires__gt=now()) - if loans.count() == 0: - return None - else: - return loans[0] + return self.acqs.filter(license=BORROWED, expires__gt=now()).first() @property def purchased(self): - purchases = self.acqs.filter(license=INDIVIDUAL, expires__isnull=True) - if purchases.count() == 0: - return None - else: - return purchases[0] + return self.acqs.filter(license=INDIVIDUAL, expires__isnull=True).first() @property def lib_acqs(self): @@ -675,24 +670,20 @@ def lib_acqs(self): @property def next_acq(self): """ This is the next available copy in the user's libraries""" - loans = self.acqs.filter(license=LIBRARY, refreshes__gt=now()).order_by('refreshes') - if loans.count() == 0: - return None - else: - return loans[0] + return self.acqs.filter(license=LIBRARY, + refreshes__gt=now()).order_by('refreshes').first() @property def borrowable(self): - return self.acqs.filter(license=LIBRARY, refreshes__lt=now()).count() > 0 + return self.acqs.filter(license=LIBRARY, refreshes__lt=now()).exists() @property def thanked(self): - return self.acqs.filter(license=THANKED).count() > 0 + return self.acqs.filter(license=THANKED).exists() @property def borrowable_acq(self): - for acq in self.acqs.filter(license=LIBRARY, refreshes__lt=now()): - return acq + return self.acqs.filter(license=LIBRARY, refreshes__lt=now()).first() @property def is_duplicate(self): @@ -706,7 +697,7 @@ def get_user_license(self, user): if user is None: return None if hasattr(user, 'is_anonymous'): - if user.is_anonymous(): + if user.is_anonymous: return None return self.user_license(self.acqs.filter(user=user)) else: @@ -732,8 +723,8 @@ def marc_records(self): return record_list class 
WorkRelation(models.Model): - to_work = models.ForeignKey('Work', related_name='works_related_to') - from_work= models.ForeignKey('Work', related_name='works_related_from') + to_work = models.ForeignKey('Work', on_delete=models.CASCADE, related_name='works_related_to') + from_work= models.ForeignKey('Work', on_delete=models.CASCADE, related_name='works_related_from') relation = models.CharField(max_length=15, choices=TEXT_RELATION_CHOICES) @@ -742,7 +733,12 @@ class Author(models.Model): name = models.CharField(max_length=255, unique=True) editions = models.ManyToManyField("Edition", related_name="authors", through="Relator") - def __unicode__(self): + class Meta: + indexes = [ + models.Index(fields=['name']), + ] + + def __str__(self): return self.name @property @@ -766,9 +762,9 @@ class Relation(models.Model): name = models.CharField(max_length=30, blank=True,) class Relator(models.Model): - relation = models.ForeignKey('Relation', default=1) #first relation should have code='aut' - author = models.ForeignKey('Author') - edition = models.ForeignKey('Edition', related_name='relators') + relation = models.ForeignKey('Relation', on_delete=models.CASCADE, default=1) #first relation should have code='aut' + author = models.ForeignKey('Author', on_delete=models.CASCADE) + edition = models.ForeignKey('Edition', on_delete=models.CASCADE, related_name='relators') class Meta: db_table = 'core_author_editions' @@ -787,28 +783,34 @@ def set(self, relation_code): except Relation.DoesNotExist: logger.warning("relation not found: code = %s" % relation_code) +AUTHMATCH = re.compile(r'\s*!([a-z]+):?\s+(.*)') + class Subject(models.Model): created = models.DateTimeField(auto_now_add=True) - name = models.CharField(max_length=200, unique=True) + name = models.CharField(max_length=200, unique=True, db_index=True) works = models.ManyToManyField("Work", related_name="subjects") is_visible = models.BooleanField(default=True) authority = models.CharField(max_length=10, blank=False, 
default="") + num_free = models.IntegerField(default=0) class Meta: ordering = ['name'] + indexes = [ + models.Index(fields=['name']), + ] @classmethod def set_by_name(cls, subject, work=None, authority=None): ''' use this method whenever you would be creating a new subject!''' subject = subject.strip() - + # make sure it's not a ; delineated list subjects = subject.split(';') for additional_subject in subjects[1:]: cls.set_by_name(additional_subject, work, authority) subject = subjects[0] # make sure there's no heading - headingmatch = re.match(r'^!(.+):(.+)', subject) + headingmatch = AUTHMATCH.match(subject) if headingmatch: subject = headingmatch.group(2).strip() authority = headingmatch.group(1).strip() @@ -820,19 +822,20 @@ def set_by_name(cls, subject, work=None, authority=None): subject = subject[6:].split('=')[0].replace('_', ' ').strip().capitalize() subject = 'Award Winner - {}'.format(subject) authority = 'award' - + if authority == 'bisacsh': + subject = interpret_notation(subject) if valid_subject(subject): (subject_obj, created) = cls.objects.get_or_create(name=subject) if not subject_obj.authority and authority: subject_obj.authority = authority - subject_obj.save() - + subject_obj.works.add(work) - return subject_obj + subject_obj.count_free() + return subject_obj else: return None - - def __unicode__(self): + + def __str__(self): return self.name @@ -842,18 +845,29 @@ def kw(self): def free_works(self): return self.works.filter(is_free=True) + + def count_free(self, force=False): + if self.is_visible or force: + self.num_free = self.works.filter(is_free=True).count() + self.save() + class Edition(models.Model): created = models.DateTimeField(auto_now_add=True) title = models.CharField(max_length=1000) - publisher_name = models.ForeignKey("PublisherName", related_name="editions", null=True, blank=True) + publisher_name = models.ForeignKey("PublisherName", on_delete=models.CASCADE, related_name="editions", null=True, blank=True) publication_date = 
models.CharField(max_length=50, null=True, blank=True, db_index=True) - work = models.ForeignKey("Work", related_name="editions", null=True) + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name="editions", null=True) cover_image = models.URLField(null=True, blank=True) unglued = models.BooleanField(default=False) - note = models.ForeignKey("EditionNote", null=True, blank=True) + note = models.ForeignKey("EditionNote", on_delete=models.CASCADE, null=True, blank=True) + + class Meta: + indexes = [ + models.Index(fields=['work']), + ] - def __unicode__(self): + def __str__(self): if self.isbn_13: return "%s (ISBN %s) %s" % (self.title, self.isbn_13, self.publisher) if self.oclc: @@ -867,42 +881,37 @@ def cover_image_large(self): #550 pixel high image if self.cover_image: im = get_thumbnail(self.cover_image, 'x550', crop='noop', quality=95) - if im.exists(): + if not im.is_default: return im.url - elif self.googlebooks_id: + if self.googlebooks_id: url = "https://encrypted.google.com/books?id=%s&printsec=frontcover&img=1&zoom=0" % self.googlebooks_id im = get_thumbnail(url, 'x550', crop='noop', quality=95) - if not im.exists() or im.storage.size(im.name) == 16392: # check for "image not available" image + if im.is_default or im.storage.size(im.name) == 16392: # check for "image not available" image url = "https://encrypted.google.com/books?id=%s&printsec=frontcover&img=1&zoom=1" % self.googlebooks_id im = get_thumbnail(url, 'x550', crop='noop', quality=95) - if im.exists(): + if not im.is_default: return im.url - else: - return '' - else: - return '' + return DEFAULT_COVER_LARGE def cover_image_small(self): #80 pixel high image if self.cover_image: im = get_thumbnail(self.cover_image, 'x80', crop='noop', quality=95) - if im.exists(): + if not im.is_default: return im.url if self.googlebooks_id: return "https://encrypted.google.com/books?id=%s&printsec=frontcover&img=1&zoom=5" % self.googlebooks_id - else: - return '' + return DEFAULT_COVER_SMALL 
def cover_image_thumbnail(self): #128 pixel wide image if self.cover_image: im = get_thumbnail(self.cover_image, '128', crop='noop', quality=95) - if im.exists(): + if not im.is_default: return im.url if self.googlebooks_id: return "https://encrypted.google.com/books?id=%s&printsec=frontcover&img=1&zoom=1" % self.googlebooks_id - else: - return '' + return DEFAULT_COVER def has_cover_image(self): if self.cover_image: @@ -999,18 +1008,18 @@ def authnames(self): @property def license(self): try: - return self.ebooks.all()[0].rights + return self.ebooks.first().rights except: return None @property def funding_info(self): - if self.ebooks.all().count() == 0: + if not self.ebooks.exists(): return '' if self.unglued: return 'The book is available as a free download thanks to the generous support of interested readers and organizations, who made donations using the crowd-funding website Unglue.it.' else: - if self.ebooks.all()[0].rights in cc.LICENSE_LIST: + if self.ebooks.first().rights in cc.LICENSE_LIST: return 'The book is available as a free download thanks to a Creative Commons license.' else: return 'The book is available as a free download because it is in the Public Domain.' 
@@ -1021,25 +1030,25 @@ def description(self): class EditionNote(models.Model): note = models.CharField(max_length=64, null=True, blank=True, unique=True) - def __unicode__(self): + def __str__(self): return self.note class Publisher(models.Model): created = models.DateTimeField(auto_now_add=True) - name = models.ForeignKey('PublisherName', related_name='key_publisher') + name = models.ForeignKey('PublisherName', on_delete=models.CASCADE, related_name='key_publisher') url = models.URLField(max_length=1024, null=True, blank=True) logo_url = models.URLField(max_length=1024, null=True, blank=True) description = models.TextField(default='', null=True, blank=True) - def __unicode__(self): + def __str__(self): return self.name.name class PublisherName(models.Model): name = models.CharField(max_length=255, blank=False, unique=True) - publisher = models.ForeignKey('Publisher', related_name='alternate_names', null=True) + publisher = models.ForeignKey('Publisher', on_delete=models.CASCADE, related_name='alternate_names', null=True) - def __unicode__(self): + def __str__(self): return self.name def save(self, *args, **kwargs): @@ -1052,17 +1061,17 @@ def save(self, *args, **kwargs): class WasWork(models.Model): - work = models.ForeignKey('Work') + work = models.ForeignKey('Work', on_delete=models.CASCADE) was = models.IntegerField(unique=True) moved = models.DateTimeField(auto_now_add=True) - user = models.ForeignKey(settings.AUTH_USER_MODEL, null=True) + user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, null=True) def safe_get_work(work_id): """ use this rather than querying the db directly for a work by id """ try: - work = Work.objects.get(id=work_id) + work = Work.objects.select_related('selected_edition').get(id=work_id) except Work.DoesNotExist: try: work = WasWork.objects.get(was=work_id).work @@ -1074,17 +1083,16 @@ def safe_get_work(work_id): return work def path_for_file(instance, filename): - return 
"ebf/{}.{}".format(uuid.uuid4().get_hex(), instance.format) + return f"ebf/{uuid.uuid4().hex}.{instance.format}" class EbookFile(models.Model): file = models.FileField(upload_to=path_for_file) format = models.CharField(max_length=25, choices=settings.FORMATS) - edition = models.ForeignKey('Edition', related_name='ebook_files') + edition = models.ForeignKey('Edition', on_delete=models.CASCADE, related_name='ebook_files') created = models.DateTimeField(auto_now_add=True) asking = models.BooleanField(default=False) - ebook = models.ForeignKey('Ebook', related_name='ebook_files', null=True) - source = models.URLField(null=True, blank=True) - mobied = models.IntegerField(default=0) #-1 indicates a failed conversion attempt + ebook = models.ForeignKey('Ebook', on_delete=models.CASCADE, related_name='ebook_files', null=True) + source = models.URLField(max_length=1024, null=True, blank=True) version = None def check_file(self): if self.format == 'epub': @@ -1098,47 +1106,14 @@ def active(self): except: return False - def make_mobi(self): - if not self.format == 'epub' or not settings.MOBIGEN_URL: - return False - if self.mobied < 0: - return False - try: - mobi_cf = ContentFile(mobi.convert_to_mobi(self.file.url)) - except: - self.mobied = -1 - self.save() - return False - new_mobi_ebf = EbookFile.objects.create( - edition=self.edition, - format='mobi', - asking=self.asking, - source=self.file.url - ) - new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf) - new_mobi_ebf.save() - if self.ebook: - new_ebook = Ebook.objects.create( - edition=self.edition, - format='mobi', - provider='Unglue.it', - url=new_mobi_ebf.file.url, - rights=self.ebook.rights, - version_label=self.ebook.version_label, - version_iter=self.ebook.version_iter, - ) - new_mobi_ebf.ebook = new_ebook - new_mobi_ebf.save() - self.mobied = 1 - self.save() - return True - send_to_kindle_limit = 7492232 class Ebook(models.Model): url = models.URLField(max_length=1024) #change to unique? 
created = models.DateTimeField(auto_now_add=True, db_index=True,) - format = models.CharField(max_length=25, choices=settings.FORMATS, blank=False) + format = models.CharField(max_length=25, + choices=settings.FORMATS + (('online', 'Online Only'),), + blank=False) provider = models.CharField(max_length=255) download_count = models.IntegerField(default=0) active = models.BooleanField(default=True) @@ -1148,8 +1123,8 @@ class Ebook(models.Model): # use 'PD-US', 'CC BY', 'CC BY-NC-SA', 'CC BY-NC-ND', 'CC BY-NC', 'CC BY-ND', 'CC BY-SA', 'CC0' rights = models.CharField(max_length=255, null=True, choices=cc.CHOICES, db_index=True) - edition = models.ForeignKey('Edition', related_name='ebooks') - user = models.ForeignKey(settings.AUTH_USER_MODEL, null=True) + edition = models.ForeignKey('Edition', on_delete=models.CASCADE, related_name='ebooks') + user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, null=True) def kindle_sendable(self): if not self.filesize or self.filesize < send_to_kindle_limit: @@ -1172,37 +1147,15 @@ def get_archive(self): # returns an open file return ebf.file def get_archive_ebf(self): # returns an ebf - if not self.ebook_files.filter(asking=False): - if not self.provider in good_providers: - return None - try: - r = urllib2.urlopen(self.url) - try: - self.filesize = int(r.info().getheaders("Content-Length")[0]) - if self.save: - self.filesize = self.filesize if self.filesize < 2147483647 else 2147483647 # largest safe positive integer - self.save() - ebf = EbookFile.objects.create( - edition=self.edition, - ebook=self, - format=self.format, - source=self.url - ) - ebf.file.save(path_for_file(ebf, None), ContentFile(r.read())) - ebf.file.close() - ebf.save() - return ebf - except IndexError: - # response has no Content-Length header probably a bad link - logging.error('Bad link error: {}'.format(self.url)) - except IOError: - logger.error(u'could not open {}'.format(self.url)) + if self.ebook_files.filter(asking=False): + ebf 
= self.ebook_files.filter(asking=False).last() + elif EbookFile.objects.filter(source=self.url, format=self.format): + ebf = self.ebook_files.filter(asking=False).last() else: - ebf = self.ebook_files.filter(asking=False).order_by('-created')[0] - if not self.filesize: - self.filesize = ebf.file.size - self.save() - return ebf + ebf, num = dl_online(self, format=self.format, force=True) + if not ebf: + return None + return ebf def set_provider(self): self.provider = Ebook.infer_provider(self.url) @@ -1214,7 +1167,7 @@ def version(self): return '.{}'.format(self.version_iter) else: return '().{}'.format(self.version_label, self.version_iter) - + def set_version(self, version): #set both version_label and version_iter with one string with format "version.iter" version_pattern = r'(.*)\.(\d+)$' @@ -1224,11 +1177,11 @@ def set_version(self, version): else: self.version_label = version self.save() - + def set_next_iter(self): # set the version iter to the next unused iter for that version for ebook in Ebook.objects.filter( - edition=self.edition, + edition=self.edition, version_label=self.version_label, format=self.format, provider=self.provider @@ -1237,7 +1190,7 @@ def set_next_iter(self): break self.version_iter = iter + 1 self.save() - + @property def rights_badge(self): if self.rights is None: @@ -1266,11 +1219,21 @@ def infer_provider(url): elif re.match(r'https?://www\.oapen\.org/download', url): provider = 'OAPEN Library' else: - provider = None + netloc = urlparse(url).netloc.lower() + if netloc in [u'dx.doi.org', u'doi.org', u'hdl.handle.net']: + try: + url = requests.get(url).url + except requests.exceptions.SSLError: + url = requests.get(url, verify=False).url + netloc = urlparse(url).netloc + if netloc.startswith('www.'): + netloc = netloc[4:] + provider = DOMAIN_TO_PROVIDER.get(netloc, netloc) return provider def increment(self): - Ebook.objects.filter(id=self.id).update(download_count=F('download_count') +1) + 
#Ebook.objects.filter(id=self.id).update(download_count=F('download_count') +1) + dllogger.info(f'{self.id}') @property def download_url(self): @@ -1279,7 +1242,7 @@ def download_url(self): def is_direct(self): return self.provider not in ('Google Books', 'Project Gutenberg') - def __unicode__(self): + def __str__(self): return "%s (%s from %s)" % (self.edition.title, self.format, self.provider) def deactivate(self): @@ -1295,20 +1258,42 @@ def set_free_flag(sender, instance, created, **kwargs): if not instance.edition.work.is_free and instance.active: instance.edition.work.is_free = True instance.edition.work.save() - elif not instance.active and instance.edition.work.is_free and instance.edition.work.ebooks().count() == 0: + for subject in instance.edition.work.subjects.all(): + subject.count_free() + elif not instance.active and instance.edition.work.is_free and not instance.edition.work.ebooks().exists(): instance.edition.work.is_free = False instance.edition.work.save() - elif instance.active and not instance.edition.work.is_free and instance.edition.work.ebooks().count() > 0: + for subject in instance.edition.work.subjects.all(): + subject.count_free() + + elif instance.active and not instance.edition.work.is_free and instance.edition.work.ebooks().exists(): instance.edition.work.is_free = True instance.edition.work.save() + for subject in instance.edition.work.subjects.all(): + subject.count_free() post_save.connect(set_free_flag, sender=Ebook) def reset_free_flag(sender, instance, **kwargs): # if the Work associated with the instance Ebook currenly has only 1 Ebook, then it's no longer a free Work # once the instance Ebook is deleted. 
- if instance.edition.work.ebooks().count() == 1: + if instance.active and instance.edition.work.ebooks().count() == 1: instance.edition.work.is_free = False instance.edition.work.save() + for subject in instance.edition.work.subjects.all(): + if subject.num_free > 0: + Subject.objects.filter(id=subject.id).update(num_free=F('num_free') - 1) pre_delete.connect(reset_free_flag, sender=Ebook) + +def check_free(sender, instance, action, model, pk_set, reverse, **kwargs): + if action in ['post_add', 'post_delete']: + if reverse: + for pk in pk_set: + subject = model.objects.get(pk=pk) + subject.count_free() + else: + instance.count_free() + +m2m_changed.connect(check_free, sender=Work.subjects.through) +m2m_changed.connect(check_free, sender=Subject.works.through) diff --git a/core/models/loader.py b/core/models/loader.py new file mode 100644 index 000000000..9246bffa3 --- /dev/null +++ b/core/models/loader.py @@ -0,0 +1,188 @@ +import logging +import re +import requests +import time +from urllib.parse import quote, unquote, urlparse, urlsplit, urlunsplit + +from django.apps import apps +from django.conf import settings +from django.core.files.base import ContentFile +from django.forms import ValidationError + +from regluit.core.validation import test_file +from regluit.core import models +#from . 
import Ebook, EbookFile + +#Ebook = apps.get_model('core', 'Ebook') +#EbookFile = apps.get_model('core', 'EbookFile') + +logger = logging.getLogger(__name__) + +def type_for_url(url, content_type=None, force=False, disposition=''): + url_disp = url + disposition + if not url: + return '' + + # check to see if we already know + for ebook in models.Ebook.objects.filter(url=url): + if ebook.format != 'online': + return ebook.format + + if not force: + if url.find('books.openedition.org') >= 0: + return 'online' + if content_type: + ct = content_type + else: + ct, disposition = contenttyper.calc_type(url) + url_disp = url + disposition + binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct) + if re.search("pdf", ct): + return "pdf" + elif binary_type and re.search("pdf", url_disp, flags=re.I): + return "pdf" + elif binary_type and re.search("epub", url_disp, flags=re.I): + return "epub" + elif binary_type and re.search("mobi", url_disp, flags=re.I): + return "mobi" + elif re.search("text/plain", ct): + return "text" + elif re.search("text/html", ct): + if url.find('oapen.org/view') >= 0: + return "html" + return "online" + elif re.search("epub", ct): + return "epub" + elif re.search("mobi", ct): + return "mobi" + elif ct == '404': + return ct + # no content-type header! 
+ elif ct == '' and re.search("epub", url_disp, flags=re.I): + return "epub" + elif ct == '' and re.search("pdf", url_disp, flags=re.I): + return "pdf" + elif ct == '' and re.search("mobi", url_disp, flags=re.I): + return "mobi" + + return "other" + +def requote(url): + # fallback for non-ascii, non-utf8 bytes in redirect location + (scheme, netloc, path, query, fragment) = urlsplit(url) + try: + newpath = quote(unquote(path), encoding='latin1') + except UnicodeEncodeError as uee: + return '' + return urlunsplit((scheme, netloc, newpath, query, fragment)) + +class ContentTyper(object): + """ """ + def __init__(self): + self.last_call = dict() + + def content_type(self, url): + def handle_ude(url, ude): + url = requote(url) + try: + return requests.get(url, allow_redirects=True, timeout=(5, 60)) + except: + logger.error('Error processing %s after unicode error', url) + + try: + try: + r = requests.head(url, allow_redirects=True, timeout=(5, 60)) + if r.status_code == 405: + try: + r = requests.get(url, timeout=(5, 60)) + except UnicodeDecodeError as ude: + if 'utf-8' in str(ude): + r = handle_ude(url, ude) + except UnicodeDecodeError as ude: + if 'utf-8' in str(ude): + r = handle_ude(url, ude) + except requests.exceptions.SSLError: + try: + r = requests.get(url, verify=False, timeout=(5, 60)) + except: + logger.error('Error processing %s verification off', url) + return '', '' + except: + logger.error('Error processing %s', url) + return '', '' + if not r: + return '', '' + if r.status_code == 404: + logger.error('File not found (404) for %s', url) + return '404', '' + return r.headers.get('content-type', ''), r.headers.get('content-disposition', '') + + def calc_type(self, url): + logger.info(url) + # is there a delay associated with the url + netloc = urlparse(url).netloc + delay = 0.1 if 'oapen.org' in netloc else 1 + + # wait if necessary + last_call = self.last_call.get(netloc) + if last_call is not None: + now = time.time() + min_time_next_call = last_call + 
delay + if min_time_next_call > now: + time.sleep(min_time_next_call-now) + + self.last_call[netloc] = time.time() + + # compute the content-type + return self.content_type(url) + +contenttyper = ContentTyper() + +def load_ebookfile(url, format, user_agent=settings.USER_AGENT, method='GET', verify=True): + ''' + return a ContentFile, format if a new ebook has been loaded + ''' + ebfs = models.EbookFile.objects.filter(source=url) + if ebfs: + return None, '' + try: + if method == 'POST': + response = requests.post(url, headers={"User-Agent": user_agent}, verify=verify, timeout=(10, 60)) + else: + response = requests.get(url, headers={"User-Agent": user_agent}, verify=verify, timeout=(10, 60)) + + except requests.exceptions.SSLError: + logger.error('bad certificate? for %s', url) + return None, '' + except IOError as e: + logger.error('could not open %s', url) + return None, '' + except UnicodeDecodeError as e: + logger.error('decoding error for %s', url) + url = requote(url) + try: + response = requests.get(url, headers={"User-Agent": user_agent}, verify=verify, timeout=(10, 60)) + except: + return None, '' + + if response.status_code == 200: + logger.debug(response.headers.get('content-type', '')) + resp_format = type_for_url(url, + content_type=response.headers.get('content-type', ''), + disposition=response.headers.get('content-disposition', '')) + if resp_format == 'online' or (format != 'online' and resp_format != format): + logger.warning('response format %s for %s is not correct', resp_format, url) + return None, resp_format + else: + logger.warning('couldn\'t get %s', url) + return None, response.status_code + + contentfile = ContentFile(response.content) + try: + test_file(contentfile, resp_format) + return contentfile, resp_format + except ValidationError as e: + logger.error('downloaded %s was not a valid %s', url, format) + None, resp_format + + diff --git a/core/models/rh_models.py b/core/models/rh_models.py index 72ca471b1..a725941a7 100644 --- 
a/core/models/rh_models.py +++ b/core/models/rh_models.py @@ -13,9 +13,9 @@ class Claim(models.Model): (u'release', u'Claim has not been accepted.'), ) created = models.DateTimeField(auto_now_add=True) - rights_holder = models.ForeignKey("RightsHolder", related_name="claim", null=False) - work = models.ForeignKey("Work", related_name="claim", null=False) - user = models.ForeignKey(settings.AUTH_USER_MODEL, related_name="claim", null=False) + rights_holder = models.ForeignKey("RightsHolder", on_delete=models.CASCADE, related_name="claim", null=False) + work = models.ForeignKey("Work", on_delete=models.CASCADE, related_name="claim", null=False) + user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="claim", null=False) status = models.CharField(max_length=7, choices=STATUSES, default='active') @property @@ -33,7 +33,7 @@ def can_open_new(self): return 2 # can open a THANKS campaign return 1 # can open any type of campaign - def __unicode__(self): + def __str__(self): return self.work.title @property @@ -66,7 +66,7 @@ class RightsHolder(models.Model): created = models.DateTimeField(auto_now_add=True) email = models.CharField(max_length=100, blank=False, default='') rights_holder_name = models.CharField(max_length=100, blank=False) - owner = models.ForeignKey(settings.AUTH_USER_MODEL, related_name="rights_holder", null=False) + owner = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="rights_holder", null=False) approved = models.BooleanField(default=False) address = models.CharField(max_length=400, blank=False, default='') mailing = models.CharField(max_length=400, blank=False, default='') @@ -76,7 +76,7 @@ class RightsHolder(models.Model): signer_title = models.CharField(max_length=30, blank=False, default='') signature = models.CharField(max_length=100, blank=False, default='' ) - def __unicode__(self): + def __str__(self): return self.rights_holder_name def notify_rh(sender, created, instance, 
**kwargs): diff --git a/core/parameters.py b/core/parameters.py index a7fffae87..4c410e8ad 100644 --- a/core/parameters.py +++ b/core/parameters.py @@ -1,8 +1,10 @@ (REWARDS, BUY2UNGLUE, THANKS) = (1, 2, 3) (INDIVIDUAL, LIBRARY, BORROWED, RESERVE, THANKED) = (1, 2, 3, 4, 5) TESTING = 0 -OFFER_CHOICES = ((INDIVIDUAL,'Individual license'),(LIBRARY,'Library License')) -ACQ_CHOICES = ((INDIVIDUAL,'Individual license'),(LIBRARY,'Library License'),(BORROWED,'Borrowed from Library'), (TESTING,'Just for Testing'), (RESERVE,'On Reserve'),(THANKED,'Already Thanked'),) +OFFER_CHOICES = ((INDIVIDUAL, 'Individual license'),(LIBRARY, 'Library License')) +ACQ_CHOICES = ((INDIVIDUAL, 'Individual license'), (LIBRARY, 'Library License'), + (BORROWED, 'Borrowed from Library'), (TESTING, 'Just for Testing'), + (RESERVE, 'On Reserve'), (THANKED, 'Already Thanked'),) AGE_LEVEL_CHOICES = ( ('', 'No Rating'), @@ -16,25 +18,28 @@ ('18-', 'Adult/Advanced Reader') ) +DOWNLOADABLE = ('pdf', 'epub', 'mobi') + TEXT_RELATION_CHOICES = ( ('translation', 'translation'), ('revision', 'revision'), ('sequel', 'sequel'), - ('part', 'part') + ('part', 'part'), + ('unspecified', 'unspecified') ) ID_CHOICES = ( - ('http', 'Web Address'), + ('http', 'Web Address'), ('isbn', 'ISBN'), - ('doab', 'DOABooks ID'), + ('doab', 'DOABooks handle'), ('gtbg', 'Project Gutenberg Number'), ('doi', 'Digital Object Identifier'), ('oclc', 'OCLC Number'), ('goog', 'Google Books ID'), - ('gdrd', 'Goodreads ID'), ('thng', 'Library Thing ID'), ('olwk', 'Open Library Work ID'), ('ltwk', 'Library Thing Work ID'), + ('oapn', 'OAPEN ID'), ) OTHER_ID_CHOICES = ( @@ -42,12 +47,70 @@ ('edid', 'pragmatic edition ID'), ) -WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab') +WORK_IDENTIFIERS = ('doi', 'olwk', 'glue', 'ltwk', 'http', 'doab') ID_CHOICES_MAP = dict(ID_CHOICES) +GOOD_PROVIDERS = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO') +DOMAIN_TO_PROVIDER = dict([ + [u'adelaide.edu.au', 
u'University of Adelaide'], + [u'aliprandi.org', u'Simone Aliprandi'], + [u'antilia.to.it', u'antilia.to.it'], + [u'antropologie.zcu.cz', u'AntropoWeb'], + [u'aupress.ca', u'Athabasca University Press'], + [u'bloomsburyacademic.com', u'Bloomsbury Academic'], + [u'books.mdpi.com', u'MDPI Books'], + [u'books.openedition.org', u'OpenEdition Books'], + [u'books.scielo.org', u'SciELO'], + [u'ccdigitalpress.org', u'Computers and Composition Digital Press'], + [u'co-action.net', u'Co-Action Publishing'], + [u'degruyter.com', u'De Gruyter Online'], + [u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'], + [u'dl.dropboxusercontent.com', u'Dropbox'], + [u'doabooks.org', u'Directory of Open Access Books'], + [u'doi.org', u'DOI Resolver'], + [u'dropbox.com', u'Dropbox'], + [u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'], + [u'dx.doi.org', u'DOI Resolver'], + [u'ebooks.iospress.nl', u'IOS Press Ebooks'], + [u'hdl.handle.net', u'Handle Proxy'], + [u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'], + [u'img.mdpi.org', u'MDPI Books'], + [u'ledibooks.com', u'LediBooks'], + [u'ledizioni.it', u'Ledizioni'], + [u'leo.cilea.it', u'LEO '], + [u'leo.cineca.it', u'Letteratura Elettronica Online'], + [u'library.oapen.org', u'OAPEN Library'], + [u'link.springer.com', u'Springer'], + [u'maestrantonella.it', u'maestrantonella.it'], + [u'oapen.org', u'OAPEN Library'], + [u'openbookpublishers.com', u'Open Book Publishers'], + [u'palgraveconnect.com', u'Palgrave Connect'], + [u'press.openedition.org', u'OpenEdition Press'], + [u'scribd.com', u'Scribd'], + [u'springerlink.com', u'Springer'], + [u'transcript-verlag.de', u'Transcript-Verlag'], + [u'ubiquitypress.com', u'Ubiquity Press'], + [u'unglueit-files.s3.amazonaws.com', u'Unglue.it'], + [u'unimib.it', u'University of Milano-Bicocca'], + [u'unito.it', u"University of Turin"], + [u'windsor.scholarsportal.info', u'Scholars Portal'], +]) +ORDER_BY_KEYS = { + 'newest':['-featured', '-created'], + 
'oldest':['created'], + 'featured':['-featured', '-num_wishes'], + 'popular':['-num_wishes'], + 'title':['title'], + 'none':[], #no ordering +} +MAX_FACETS = 2 +DONATION_CHOICES = ( + ('general', 'The FEF General Fund'), + ('monographs', 'The FEF Open Access Monographs Fund'), +) diff --git a/core/pdf.py b/core/pdf.py index 0931e46b6..b43430708 100644 --- a/core/pdf.py +++ b/core/pdf.py @@ -2,34 +2,36 @@ Utilities that manipulate pdf files """ import logging +from io import BytesIO, StringIO +from tempfile import NamedTemporaryFile + import requests from xhtml2pdf import pisa # import python module -from PyPDF2 import PdfFileMerger,PdfFileReader -from StringIO import StringIO -from tempfile import NamedTemporaryFile +from pypdf import PdfWriter, PdfReader +from pypdf.errors import PdfReadError +from pypdf import PageRange from django.template.loader import render_to_string -from regluit import settings +from django.conf import settings logger = logging.getLogger(__name__) # Utility function def ask_pdf(context={}): - ask_html = StringIO(unicode(render_to_string('pdf/ask.html', context))) + ask_html = StringIO(str(render_to_string('pdf/ask.html', context))) # open output file for writing (truncated binary) - resultFile = StringIO() + resultFile = BytesIO() # convert HTML to PDF pisaStatus = pisa.CreatePDF( - src=ask_html, # the HTML to convert - dest=resultFile) # file to recieve result - + src=ask_html, # the HTML to convert + dest=resultFile, # file to recieve result + ) # True on success and False on errors assert pisaStatus.err == 0 return resultFile -def pdf_append( file1, file2, file_out ): - merger = PdfFileMerger(strict=False) - merger.append(file1) +def pdf_append(file1, file2, file_out): + merger = PdfWriter(file1) merger.append(file2) merger.write(file_out) merger.close() @@ -37,7 +39,7 @@ def pdf_append( file1, file2, file_out ): def test_pdf(pdf_file): temp = None try: - if isinstance(pdf_file , (str, unicode)): + if isinstance(pdf_file, str): if 
pdf_file.startswith('http:') or pdf_file.startswith('https:'): temp = NamedTemporaryFile(delete=False) test_file_content = requests.get(pdf_file).content @@ -50,7 +52,7 @@ def test_pdf(pdf_file): pdf_file.seek(0) temp = pdf_file try: - PdfFileReader(temp) + PdfReader(temp) success = True except: success = False @@ -60,7 +62,36 @@ def test_pdf(pdf_file): logger.exception('error testing a pdf: %s' % pdf_file[:100]) return False -def test_test_pdf(self): +def staple_pdf(urllist, user_agent=settings.USER_AGENT, strip_covers=0): + pages = None + all_but_cover = PageRange('%s:' % int(strip_covers)) + merger = PdfWriter(None) + s = requests.Session() + for url in urllist: + try: + response = s.get(url, headers={"User-Agent": user_agent}) + except requests.exceptions.ConnectionError: + logger.error("Error getting url: %s", url) + return None + if response.status_code == 200: + try: + logger.debug('adding %s bytes from %s', len(response.content), url) + merger.append(BytesIO(response.content), pages=pages) + except PdfReadError: + logger.error("error reading pdf url: %s", url) + return None + else: + return None + pages = all_but_cover if strip_covers else pages + out = BytesIO() + try: + merger.write(out) + except (PdfReadError, RecursionError): + logger.error("error writing pdf url: %s", url) + return None + return out + +def test_test_pdf(): assert(test_pdf(settings.TEST_PDF_URL)) temp = NamedTemporaryFile(delete=False) test_file_content = requests.get(settings.TEST_PDF_URL).content diff --git a/core/search.py b/core/search.py index a2fd24b7a..00820baee 100644 --- a/core/search.py +++ b/core/search.py @@ -1,52 +1,58 @@ import re import json -import requests -import regluit.core.isbn +import requests from django.conf import settings +from regluit.core.covers import DEFAULT_COVER +import regluit.core.isbn + def gluejar_search(q, user_ip='69.243.24.29', page=1): """normalizes results from the google books search suitable for gluejar """ results = [] - 
search_result=googlebooks_search(q, user_ip, page) + search_result = googlebooks_search(q, user_ip, page) if 'items' in search_result.keys(): for item in search_result['items']: v = item['volumeInfo'] - r = {'title': v.get('title', ""), + r = {'title': v.get('title', ""), 'description': v.get('description', ""), 'publisher': v.get('publisher', ""), 'googlebooks_id': item.get('id')} - + # TODO: allow multiple authors - if v.has_key('authors') and len(v['authors']) == 1 : + if 'authors' in v and len(v['authors']) == 1: r['author'] = r['authors_short'] = v['authors'][0] - elif v.has_key('authors') and len(v['authors']) > 2: + elif 'authors' in v and len(v['authors']) > 2: r['author'] = v['authors'][0] - r['authors_short'] = '%s et al.' % v['authors'][0] - elif v.has_key('authors') and len(v['authors']) == 2: + r['authors_short'] = '%s et al.' % v['authors'][0] + elif 'authors' in v and len(v['authors']) == 2: r['author'] = v['authors'][0] - r['authors_short'] = '%s and %s' % (v['authors'][0], v['authors'][1]) + r['authors_short'] = '%s and %s' % (v['authors'][0], v['authors'][1]) else: r['author'] = "" r['isbn_13'] = None - + # pull out isbns for i in v.get('industryIdentifiers', []): if i['type'] == 'ISBN_13': r['isbn_13'] = i['identifier'] elif i['type'] == 'ISBN_10': - if not r['isbn_13'] : + if not r['isbn_13']: r['isbn_13'] = regluit.core.isbn.convert_10_to_13(i['identifier']) - + # cover image - if v.has_key('imageLinks'): + if 'imageLinks' in v: url = v['imageLinks'].get('thumbnail', "") - url = re.sub(r'http://(bks[0-9]+\.)?books\.google\.com', 'https://encrypted.google.com', url) + url = re.sub( + r'http://(bks[0-9]+\.)?books\.google\.com', + 'https://encrypted.google.com', + url, + ) r['cover_image_thumbnail'] = url else: - r['cover_image_thumbnail'] = "/static/images/generic_cover_larger.png" - + r['cover_image_thumbnail'] = DEFAULT_COVER + access_info = item.get('accessInfo') if access_info: epub = access_info.get('epub') @@ -56,20 +62,23 @@ def 
gluejar_search(q, user_ip='69.243.24.29', page=1): if pdf and pdf.get('downloadLink'): r['first_pdf_url'] = pdf['downloadLink'] results.append(r) - return results + return results def googlebooks_search(q, user_ip, page): if len(q) < 2 or len(q) > 2000: return {} - # XXX: need to pass IP address of user in from the frontend + # XXX: need to pass IP address of user in from the frontend headers = {'X-Forwarded-For': user_ip} - start = (page - 1) * 10 + start = (page - 1) * 10 params = {'q': q, 'startIndex': start, 'maxResults': 10} if hasattr(settings, 'GOOGLE_BOOKS_API_KEY'): params['key'] = settings.GOOGLE_BOOKS_API_KEY - - r = requests.get('https://www.googleapis.com/books/v1/volumes', - params=params, headers=headers) + + r = requests.get( + 'https://www.googleapis.com/books/v1/volumes', + params=params, + headers=headers + ) # urls like https://www.googleapis.com/books/v1/volumes?q=invisible+engines&startIndex=0&maxResults=10&key=[key] return json.loads(r.content) diff --git a/core/signals.py b/core/signals.py index 96f94cf18..2dee7ab66 100644 --- a/core/signals.py +++ b/core/signals.py @@ -5,8 +5,6 @@ import itertools import logging -from tastypie.models import create_api_key - """ django imports """ @@ -25,14 +23,14 @@ from django.utils.timezone import now from notification import models as notification +from registration.signals import user_activated """ regluit imports """ from regluit.payment.signals import transaction_charged, transaction_failed, pledge_modified, pledge_created -from regluit.core.parameters import REWARDS, BUY2UNGLUE, THANKS, LIBRARY, RESERVE, THANKED -from regluit.libraryauth.models import Library, LibraryUser -from regluit.utils.localdatetime import date_today +from regluit.core.parameters import REWARDS, THANKS, THANKED +from regluit.libraryauth.models import LibraryUser logger = logging.getLogger(__name__) @@ -48,7 +46,8 @@ def create_user_objects(sender, created, instance, **kwargs): if created: Wishlist.objects.create(user=instance) 
profile = UserProfile.objects.create(user=instance) - profile.ml_subscribe() + if instance.social_auth.exists(): + instance.profile.ml_subscribe() except DatabaseError: # this can happen when creating superuser during syncdb since the # core_wishlist table doesn't exist yet @@ -57,10 +56,6 @@ def create_user_objects(sender, created, instance, **kwargs): post_save.connect(create_user_objects, sender=User) -# create API key for new User -post_save.connect(create_api_key, sender=User) - - # create notification types (using django-notification) -- tie to syncdb def create_notice_types( **kwargs): @@ -173,36 +168,6 @@ def handle_transaction_charged(sender,transaction=None, **kwargs): send_mail_task.delay('unglue.it donation confirmation', message, 'notices@gluejar.com', [transaction.receipt]) elif transaction.campaign.type is REWARDS: notification.send([transaction.user], "pledge_charged", context, True) - elif transaction.campaign.type is BUY2UNGLUE: - # provision the book - Acq = apps.get_model('core', 'Acq') - if transaction.offer.license == LIBRARY: - library = Library.objects.get(id=transaction.extra['library_id']) - new_acq = Acq.objects.create(user=library.user,work=transaction.campaign.work,license= LIBRARY) - if transaction.user_id != library.user_id: # don't put it on reserve if purchased by the library - reserve_acq = Acq.objects.create(user=transaction.user,work=transaction.campaign.work,license= RESERVE, lib_acq = new_acq) - reserve_acq.expire_in(datetime.timedelta(hours=2)) - copies = int(transaction.extra.get('copies',1)) - while copies > 1: - Acq.objects.create(user=library.user,work=transaction.campaign.work,license= LIBRARY) - copies -= 1 - else: - if transaction.extra.get('give_to', False): - # it's a gift! 
- Gift = apps.get_model('core', 'Gift') - giftee = Gift.giftee(transaction.extra['give_to'], str(transaction.id)) - new_acq = Acq.objects.create(user=giftee, work=transaction.campaign.work, license= transaction.offer.license) - gift = Gift.objects.create(acq=new_acq, message=transaction.extra.get('give_message',''), giver=transaction.user , to = transaction.extra['give_to']) - context['gift'] = gift - notification.send([giftee], "purchase_gift", context, True) - else: - new_acq = Acq.objects.create(user=transaction.user,work=transaction.campaign.work,license= transaction.offer.license) - transaction.campaign.update_left() - notification.send([transaction.user], "purchase_complete", context, True) - from regluit.core.tasks import watermark_acq - watermark_acq.delay(new_acq) - if transaction.campaign.cc_date < date_today() : - transaction.campaign.update_status(send_notice=True) elif transaction.campaign.type is THANKS: if transaction.user: Acq = apps.get_model('core', 'Acq') @@ -300,7 +265,7 @@ def handle_wishlist_added(supporter, work, **kwargs): from regluit.core.tasks import emit_notifications emit_notifications.delay() - + wishlist_added.connect(handle_wishlist_added) deadline_impending = Signal(providing_args=["campaign"]) @@ -349,4 +314,11 @@ def notify_join_library(sender, created, instance, **kwargs): 'user': instance.user, }) -post_save.connect(notify_join_library, sender=LibraryUser) \ No newline at end of file +post_save.connect(notify_join_library, sender=LibraryUser) + +from registration.signals import user_activated + +def ml_subscribe(user, request, **kwargs): + user.profile.ml_subscribe() + +user_activated.connect(ml_subscribe) \ No newline at end of file diff --git a/core/sitemaps.py b/core/sitemaps.py index 0b29e44a1..605645a64 100644 --- a/core/sitemaps.py +++ b/core/sitemaps.py @@ -1,5 +1,5 @@ from django.contrib.sitemaps import Sitemap -from django.core.urlresolvers import reverse +from django.urls import reverse from regluit.core.models import 
Work, Edition class WorkSitemap(Sitemap): @@ -7,17 +7,7 @@ class WorkSitemap(Sitemap): limit = 10000 def items(self): - return Work.objects.all() + return Work.objects.filter(is_free=True) def priority(self,work): return '{:.1f}'.format(work.priority()/5.0) - -class PublisherSitemap(Sitemap): - priority = 0.2 - protocol = 'https' - - def items(self): - return Edition.objects.exclude(publisher_name__isnull=True).order_by('publisher_name__name').values('publisher_name').distinct() - - def location(self, pub): - return reverse("bypubname_list",args=[pub['publisher_name']]) diff --git a/core/tasks.py b/core/tasks.py index 36dc8f2bd..f5fe2defc 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -1,37 +1,37 @@ -""" -external library imports -""" +# +# external library imports +# import logging +import random -from celery.task import task from datetime import timedelta from time import sleep +from celery.task import task -""" -django imports -""" +# +# django imports +# from django.conf import settings from django.contrib.auth.models import User from django.core.mail import send_mail +from django.core.management import call_command from django.utils.timezone import now from notification.engine import send_all from notification import models as notification from mailchimp3 import MailChimp -from mailchimp3.mailchimpclient import MailChimpError -""" -regluit imports -""" +# +# regluit imports +# from regluit.core import ( bookloader, + covers, models, - goodreads, - librarything, - mobigen + librarything ) -from regluit.core.models import Campaign, Acq, Gift +from regluit.core.models import Acq, Campaign, EbookFile, Gift, UserProfile, Work from regluit.core.signals import deadline_impending from regluit.core.parameters import RESERVE, REWARDS, THANKS from regluit.utils.localdatetime import date_today @@ -39,39 +39,33 @@ logger = logging.getLogger(__name__) mc_client = MailChimp(mc_api=settings.MAILCHIMP_API_KEY) -@task +@task def populate_edition(isbn): """given an edition 
this task will populate the database with additional information about related editions and subjects related to this edition """ bookloader.add_related(isbn) - edition=models.Edition.get_by_isbn(isbn) + edition = models.Edition.get_by_isbn(isbn) if edition: bookloader.add_openlibrary(edition.work) return edition -@task -def load_goodreads_shelf_into_wishlist(user_id, shelf_name='all', goodreads_user_id=None, max_books=None, - expected_number_of_books=None): - user=User.objects.get(id=user_id) - return goodreads.load_goodreads_shelf_into_wishlist(user,shelf_name,goodreads_user_id,max_books, expected_number_of_books) - @task def load_librarything_into_wishlist(user_id, lt_username, max_books=None): - user=User.objects.get(id=user_id) + user = User.objects.get(id=user_id) return librarything.load_librarything_into_wishlist(user, lt_username, max_books) - + @task def fac(n, sleep_interval=None): - # used to test celery task execution - if not(isinstance(n,int) and n >= 0): + # used to test celery task execution + if not(isinstance(n, int) and n >= 0): raise Exception("You can't calculate a factorial of %s " % (str(n))) if n <= 1: return 1 else: res = 1 - for i in xrange(2,n+1): - res = res*i + for i in range(2, n+1): + res = res * i fac.update_state(state="PROGRESS", meta={"current": i, "total": n}) if sleep_interval is not None: sleep(sleep_interval) @@ -80,34 +74,39 @@ def fac(n, sleep_interval=None): @task def send_mail_task(subject, message, from_email, recipient_list, - fail_silently=False, auth_user=None, auth_password=None, - connection=None, override_from_email=True): + fail_silently=False, auth_user=None, auth_password=None, + connection=None, override_from_email=True): """a task to drop django.core.mail.send_mail into """ - # NOTE: since we are currently using Amazon SES, which allows email to be sent only from validated email - # addresses, we force from_email to be one of the validated address unless override_from_email is FALSE + # NOTE: since we are 
currently using Amazon SES, which allows email to be sent only from + # validated email addresses, we force from_email to be one of the validated + # address unless override_from_email is FALSE try: if override_from_email: try: from_email = settings.DEFAULT_FROM_EMAIL except: pass - r= send_mail(subject, message, from_email, recipient_list, fail_silently=False, auth_user=auth_user, - auth_password=auth_password, connection=connection) + r = send_mail(subject, message, from_email, recipient_list, fail_silently=False, + auth_user=auth_user, auth_password=auth_password, connection=connection) + logger.info('sent mail about %s to %s', subject, recipient_list) except: - r=logger.info('failed to send message:' + message) + r = logger.info('failed to send message:', message) return r - + #task to update the status of active campaigns @task def update_active_campaign_status(): """update the status of all active campaigns -- presumed to be run at midnight Eastern time""" - return [c.update_status(send_notice=True, ignore_deadline_for_success=True, process_transactions=True) for c in Campaign.objects.filter(status='Active') ] + return [ + c.update_status(send_notice=True, ignore_deadline_for_success=True, + process_transactions=True) for c in Campaign.objects.filter(status='Active') + ] @task def emit_notifications(): - logger.info('notifications emitting' ) - return send_all() - + logger.info('notifications emitting') + send_all() + @task def report_new_ebooks(created=None): #created= creation date if created: @@ -126,51 +125,55 @@ def report_new_ebooks(created=None): #created= creation date True ) break - + @task def notify_ending_soon(): c_active = Campaign.objects.filter(status='Active', type=REWARDS) for c in c_active: if c.deadline - now() < timedelta(7) and c.deadline - now() >= timedelta(6): - """ - if the campaign is still active and there's only a week left until it closes, send reminder notification - """ + # if the campaign is still active and there's only a week 
left until it closes, + # send reminder notification deadline_impending.send(sender=None, campaign=c) @task -def watermark_acq(acq): +def watermark_acq(acq_id): + try: + acq = Acq.objects.get(acq_id) + except Acq.DoesNotExist as e: + logger.error("error getting acq %s", acq_id) + return False acq.get_watermarked() - + @task -def process_ebfs(campaign): +def process_ebfs(campaign_id): + try: + campaign = Campaign.objects.get(campaign_id) + except Campaign.DoesNotExist as e: + logger.error("error getting acq %s", campaign_id) + return False if campaign.type == THANKS: if campaign.use_add_ask: campaign.add_ask_to_ebfs() else: campaign.revert_asks() - campaign.make_mobis() - -@task -def make_mobi(ebookfile): - return ebookfile.make_mobi() - + @task def refresh_acqs(): in_10_min = now() + timedelta(minutes=10) acqs = Acq.objects.filter(refreshed=False, refreshes__lt=in_10_min) - logger.info('refreshing %s acqs' % acqs.count()) + logger.info('refreshing %s acqs', acqs.count()) for acq in acqs: for hold in acq.holds: # create a 1 day reserve on the acq - reserve_acq = Acq.objects.create( - user = hold.user, - work = hold.work, - license = RESERVE, - lib_acq = acq, - ) + reserve_acq = Acq.objects.create( + user=hold.user, + work=hold.work, + license=RESERVE, + lib_acq=acq, + ) # the post_save handler takes care of pushing expires vis acq.expires_in - + # notify the user with the hold if 'example.org' not in reserve_acq.user.email: notification.send_now([reserve_acq.user], "library_reserve", {'acq':reserve_acq}) @@ -181,15 +184,13 @@ def refresh_acqs(): acq.refreshed = True @task -def convert_to_mobi(input_url, input_format="application/epub+zip"): - return mobigen.convert_to_mobi(input_url, input_format) - -@task -def generate_mobi_ebook_for_edition(edition): - return mobigen.generate_mobi_ebook_for_edition(edition) +def ml_subscribe_task(profile_id, **kwargs): + try: + profile = UserProfile.objects.get(profile_id) + except UserProfile.DoesNotExist as e: + 
logger.error("error getting profile %s", profile_id) + return False -@task -def ml_subscribe_task(profile, **kwargs): try: if not profile.on_ml: data = {"email_address": profile.user.email, "status_if_new": "pending"} @@ -199,20 +200,36 @@ def ml_subscribe_task(profile, **kwargs): data=data, ) return True - except Exception, e: - logger.error("error subscribing to mailchimp list %s" % (e)) + except Exception as e: + logger.error("error subscribing to mailchimp list %s", e) return False @task def notify_unclaimed_gifts(): unclaimed = Gift.objects.filter(used=None) for gift in unclaimed: - """ - send notice every 7 days, but stop at 10x - """ - unclaimed_duration = (now() - gift.acq.created ).days + # send notice every 7 days, but stop at 10x + unclaimed_duration = (now() - gift.acq.created).days if unclaimed_duration > 70: return - if unclaimed_duration > 0 and unclaimed_duration % 7 == 0 : # first notice in 7 days + if unclaimed_duration > 0 and unclaimed_duration % 7 == 0: # first notice in 7 days notification.send_now([gift.acq.user], "purchase_gift_waiting", {'gift':gift}, True) notification.send_now([gift.giver], "purchase_notgot_gift", {'gift':gift}, True) + +@task +def periodic_cleanup(): + call_command('clearsessions') + call_command('cleanupregistration') + +@task +def feature_new_work(): + works = Work.objects.filter(is_free=True, featured__isnull=True).order_by('-num_wishes') + work = works[random.randrange(0, 50)] + work.featured = now() + work.save() + +@task +def make_cover_thumbnail(url, geom_string, **options): + success = covers.make_cover_thumbnail(url, geom_string, **options) + logger.error('bad cover image %s: %s', url) + \ No newline at end of file diff --git a/core/tests.py b/core/tests.py index 5c5050bd7..9e34a77c0 100755 --- a/core/tests.py +++ b/core/tests.py @@ -6,15 +6,17 @@ from decimal import Decimal as D from math import factorial import unittest -from urlparse import parse_qs, urlparse +from urllib.parse import urlparse, parse_qs from 
tempfile import NamedTemporaryFile from time import sleep, mktime -from celery.task.sets import TaskSet +from celery import group import requests import requests_mock +from pyepub import EPUB #django imports +from django.apps import apps from django.conf import settings from django.contrib.auth.models import User from django.contrib.contenttypes.models import ContentType @@ -30,26 +32,29 @@ from django_comments.models import Comment -#regluit imports +from regluit.payment.models import Transaction +from regluit.payment.parameters import PAYMENT_TYPE_AUTHORIZATION +from regluit.utils.localdatetime import date_today -from regluit.core import ( - isbn, +from . import ( bookloader, + covers, + isbn, + librarything, models, + parameters, search, - goodreads, - librarything, tasks, - parameters, ) -from regluit.core.models import ( +from .epub import test_epub +from .loaders.utils import (load_from_books, loaded_book_ok, ) +from .models import ( Campaign, Work, UnglueitError, Edition, RightsHolder, Claim, - Key, Ebook, Premium, Subject, @@ -58,18 +63,11 @@ EbookFile, Acq, Hold, + safe_get_work, ) -from regluit.libraryauth.models import Library -from regluit.core.parameters import TESTING, LIBRARY, RESERVE -from regluit.core.loaders.utils import (load_from_books, loaded_book_ok, ) -from regluit.core.validation import valid_subject -from regluit.frontend.views import safe_get_work -from regluit.payment.models import Transaction -from regluit.payment.parameters import PAYMENT_TYPE_AUTHORIZATION -from regluit.pyepub import EPUB -from regluit.utils.localdatetime import date_today -from .epub import test_epub +from .parameters import TESTING, LIBRARY, RESERVE from .pdf import test_pdf +from .validation import valid_subject TESTDIR = os.path.join(os.path.dirname(__file__), '../test/') YAML_VERSIONFILE = os.path.join(TESTDIR, 'versiontest.yaml') @@ -111,7 +109,7 @@ def test_add_by_yaml(self): def test_add_by_isbn_mock(self): with requests_mock.Mocker(real_http=True) as m: - with 
open(os.path.join(TESTDIR, 'gb_hamilton.json')) as gb: + with open(os.path.join(TESTDIR, 'gb_hamilton.json'), 'rb') as gb: m.get('https://www.googleapis.com/books/v1/volumes', content=gb.read()) self.test_add_by_isbn(mocking=True) @@ -120,22 +118,22 @@ def test_add_by_isbn(self, mocking=False): if not (mocking or settings.TEST_INTEGRATION): return # edition - edition = bookloader.add_by_isbn('9780143034759') + edition = bookloader.add_by_isbn('9781101200858') self.assertEqual(edition.title, u'Alexander Hamilton') - self.assertEqual(edition.publication_date, u'2005') + self.assertTrue('2005' in edition.publication_date) self.assertEqual(edition.publisher, u'Penguin') - self.assertEqual(edition.isbn_10, '0143034758') - self.assertEqual(edition.isbn_13, '9780143034759') - self.assertEqual(edition.googlebooks_id, '4iafgTEhU3QC') + self.assertEqual(edition.isbn_10, '1101200855') + self.assertEqual(edition.isbn_13, '9781101200858') + self.assertTrue(edition.googlebooks_id in ('4z5eL5SGjEoC', '4iafgTEhU3QC')) # authors self.assertEqual(edition.authors.all().count(), 1) - self.assertEqual(edition.authors.all()[0].name, u'Ron Chernow') + self.assertEqual(edition.authors.first().name, u'Ron Chernow') # work self.assertTrue(edition.work) - self.assertEqual(edition.work.googlebooks_id, '4iafgTEhU3QC') - self.assertEqual(edition.work.first_isbn_13(), '9780143034759') + self.assertTrue(edition.googlebooks_id in ('4z5eL5SGjEoC', '4iafgTEhU3QC')) + self.assertEqual(edition.work.first_isbn_13(), '9781101200858') # test duplicate pubname ed2 = Edition.objects.create(work=edition.work) @@ -155,19 +153,19 @@ def test_add_by_isbn(self, mocking=False): def test_language_locale_mock(self): with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, 'zhCN.json')) as gb: + with open(os.path.join(TESTDIR, 'gb_zhCN.json'), 'rb') as gb: m.get('https://www.googleapis.com/books/v1/volumes', content=gb.read()) self.test_language_locale(mocking=True) def 
test_language_locale(self, mocking=False): if not (mocking or settings.TEST_INTEGRATION): return - edition = bookloader.add_by_isbn('9787104030126') + edition = bookloader.add_by_isbn('9787115401519') self.assertEqual(edition.work.language, u'zh-CN') def test_update_edition_mock(self): with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, 'python4da.json')) as gb: + with open(os.path.join(TESTDIR, 'python4da.json'), 'rb') as gb: m.get('https://www.googleapis.com/books/v1/volumes', content=gb.read()) self.test_update_edition(mocking=True) @@ -198,7 +196,7 @@ def test_missing_isbn(self): def test_thingisbn_mock(self): with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, '9780441569595.xml')) as lt: + with open(os.path.join(TESTDIR, '9780441569595.xml'), 'rb') as lt: m.get('https://www.librarything.com/api/thingISBN/0441007465', content=lt.read()) self.test_thingisbn(mocking=True) @@ -220,7 +218,7 @@ def test_add_related(self): langbefore = models.Work.objects.filter(language=lang).count() # ask for related editions to be added using the work we just created with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, '9780441569595.xml')) as lt: + with open(os.path.join(TESTDIR, '9780441569595.xml'), 'rb') as lt: m.get('https://www.librarything.com/api/thingISBN/0441007465', content=lt.read()) bookloader.add_related('0441007465') # should join the editions self.assertTrue(models.Edition.objects.count() >= edbefore) @@ -241,7 +239,7 @@ def test_add_related(self): def test_populate_edition(self): edition = bookloader.add_by_isbn('9780606301121') # A People's History Of The United States with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, '9780061989834.xml')) as lt: + with open(os.path.join(TESTDIR, '9780061989834.xml'), 'rb') as lt: m.get('https://www.librarything.com/api/thingISBN/9780606301121', content=lt.read()) edition = 
tasks.populate_edition.run(edition.isbn_13) self.assertTrue(edition.work.editions.all().count() > 10) @@ -291,6 +289,8 @@ def test_merge_works_mechanics(self): self.assertTrue(w2.is_free) self.assertFalse(w1.is_free) + sub2 = Subject.objects.get(pk=sub2.pk) + self.assertEqual(sub2.num_free, 1) w1_id = w1.id w2_id = w2.id @@ -310,7 +310,8 @@ def test_merge_works_mechanics(self): self.assertEqual(models.Work.objects.count(), before + 1) self.assertEqual(models.WasWork.objects.count(), wasbefore + 1) self.assertEqual(w1.subjects.count(), 2) - + sub1 = Subject.objects.get(pk=sub1.pk) + self.assertEqual(sub2.num_free, 1) self.assertTrue(w1.is_free) # getting proper view? @@ -372,7 +373,7 @@ def test_merge_works(self): c2.save() self.assertEqual(c2.pk, e2.work.last_campaign().pk) # comment on the works - site = Site.objects.all()[0] + site = Site.objects.first() wct = ContentType.objects.get_for_model(models.Work) comment1 = Comment( content_type=wct, @@ -423,14 +424,14 @@ def test_merge_works(self): def test_ebook(self): with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, 'gb_latinlanguage.json')) as gb: + with open(os.path.join(TESTDIR, 'gb_latinlanguage.json'), 'rb') as gb: m.get('https://www.googleapis.com/books/v1/volumes', content=gb.read()) edition = bookloader.add_by_oclc('1246014') # we've seen the public domain status of this book fluctuate -- and the OCLC # number can disappear. 
So if the ebook count is 2 then test #if edition is not None and edition.ebooks.count() == 2: self.assertEqual(edition.ebooks.count(), 2) - #ebook_epub = edition.ebooks.all()[0] + #ebook_epub = edition.ebooks.first() ebook_epub = edition.ebooks.filter(format='epub')[0] self.assertEqual(ebook_epub.format, 'epub') self.assertEqual(parse_qs(urlparse(ebook_epub.url).query).get("id"), ['N1RfAAAAMAAJ']) @@ -459,15 +460,15 @@ def test_ebook(self): ebook_pdf.increment() updated_ebook = Ebook.objects.get(pk=ebook_pdf.pk) - self.assertEqual(int(updated_ebook.download_count), 1) - self.assertEqual(int(edition.work.download_count), 1) + #self.assertEqual(int(updated_ebook.download_count), 1) + #self.assertEqual(int(edition.work.download_count), 1) def test_add_no_ebook(self): # this edition lacks an ebook, but we should still be able to load it # http://books.google.com/books?id=D-WjL_HRbNQC&printsec=frontcover#v=onepage&q&f=false # Social Life of Information with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, 'gb_sociallife.json')) as gb: + with open(os.path.join(TESTDIR, 'gb_sociallife.json'), 'rb') as gb: m.get('https://www.googleapis.com/books/v1/volumes', content=gb.read()) e = bookloader.add_by_isbn('1578517087') self.assertTrue(e) @@ -480,12 +481,11 @@ def test_add_openlibrary(self): self.assertTrue(len(subjects) > 10) self.assertTrue('Science fiction' in subjects) self.assertTrue('/works/OL27258W' in work.identifiers.filter(type='olwk').values_list('value', flat=True)) - self.assertTrue('888628' in work.identifiers.filter(type='gdrd').values_list('value', flat=True)) self.assertTrue('609' in work.identifiers.filter(type='ltwk').values_list('value', flat=True)) def test_unicode_openlibrary(self): with requests_mock.Mocker(real_http=True) as m: - with open(os.path.join(TESTDIR, 'gb_fightclub.json')) as gb: + with open(os.path.join(TESTDIR, 'gb_fightclub.json'), 'rb') as gb: m.get('https://www.googleapis.com/books/v1/volumes', 
content=gb.read()) work = bookloader.add_by_isbn('9783894808358').work #fight club bookloader.add_openlibrary(work) @@ -526,8 +526,8 @@ class SearchTests(TestCase): def test_search_mock(self): with requests_mock.Mocker(real_http=True) as m: with open( - os.path.join(TESTDIR, 'gb_melville.json') - ) as gb, open(os.path.join(TESTDIR, 'gb_melville2.json')) as gb2: + os.path.join(TESTDIR, 'gb_melville.json'), 'rb' + ) as gb, open(os.path.join(TESTDIR, 'gb_melville2.json'), 'rb') as gb2: m.get( 'https://www.googleapis.com/books/v1/volumes', [{'content':gb2.read()}, {'content':gb.read()}] @@ -543,17 +543,17 @@ def test_basic_search(self, mocking=False): self.assertEqual(len(results), 10) r = results[0] - self.assertTrue(r.has_key('title')) - self.assertTrue(r.has_key('author')) - self.assertTrue(r.has_key('description')) - self.assertTrue(r.has_key('cover_image_thumbnail')) + self.assertTrue('title' in r) + self.assertTrue('author' in r) + self.assertTrue('description' in r) + self.assertTrue('cover_image_thumbnail' in r) self.assertTrue( r['cover_image_thumbnail'].startswith('https') or r['cover_image_thumbnail'].startswith('http') ) - self.assertTrue(r.has_key('publisher')) - self.assertTrue(r.has_key('isbn_13')) - self.assertTrue(r.has_key('googlebooks_id')) + self.assertTrue('publisher' in r) + self.assertTrue('isbn_13' in r) + self.assertTrue('googlebooks_id' in r) def test_pagination(self, mocking=False): if not (mocking or settings.TEST_INTEGRATION): @@ -682,7 +682,7 @@ def test_campaign_status(self): #cloning (note we changed c3 to w2 to make it clonable) c7 = c3.clone() self.assertEqual(c7.status, 'INITIALIZED') - self.assertEqual(c7.premiums.all()[0].description, 'botsnack') + self.assertEqual(c7.premiums.first().description, 'botsnack') # SUCCESSFUL @@ -774,7 +774,7 @@ def test_add_remove(self): user.wishlist.add_work(work, 'test') self.assertEqual(user.wishlist.works.count(), 1) self.assertEqual(work.num_wishes, num_wishes+1) - 
self.assertEqual(work.priority(),1) + self.assertEqual(work.priority(), 2) user.wishlist.remove_work(work) self.assertEqual(user.wishlist.works.count(), 0) self.assertEqual(work.num_wishes, num_wishes) @@ -790,43 +790,12 @@ def test_single_fac(self): def test_subtask(self): n = 30 subtasks = [tasks.fac.subtask(args=(x,)) for x in range(n)] - job = TaskSet(tasks=subtasks) + job = group(subtasks) result = job.apply_async() while not result.ready(): sleep(0.2) self.assertEqual(result.join(), [factorial(x) for x in range(n)]) -class GoodreadsTest(TestCase): - - @unittest.skip("Goodreads down at the moment") - def test_goodreads_shelves(self): - if not settings.GOODREADS_API_SECRET: - return - # test to see whether the core undeletable shelves are on the list - gr_uid = "767708" # for Raymond Yee - gc = goodreads.GoodreadsClient( - key=settings.GOODREADS_API_KEY, - secret=settings.GOODREADS_API_SECRET - ) - shelves = gc.shelves_list(gr_uid) - shelf_names = [s['name'] for s in shelves['user_shelves']] - self.assertTrue('currently-reading' in shelf_names) - self.assertTrue('read' in shelf_names) - self.assertTrue('to-read' in shelf_names) - - @unittest.skip("Goodreads down at the moment") - def test_review_list_unauth(self): - if not settings.GOODREADS_API_SECRET: - return - gr_uid = "767708" # for Raymond Yee - gc = goodreads.GoodreadsClient( - key=settings.GOODREADS_API_KEY, - secret=settings.GOODREADS_API_SECRET - ) - reviews = gc.review_list_unauth(user_id=gr_uid, shelf='read') - # test to see whether there is a book field in each of the review - # url for test is https://www.goodreads.com/review/list.xml?id=767708&shelf=read&page=1&per_page=20&order=a&v=2&key=[key] - self.assertTrue(all([r.has_key("book") for r in reviews])) class LibraryThingTest(TestCase): @@ -903,24 +872,12 @@ def test_ISBN(self): self.assertEqual(isbn.ISBN(python_13).validate(), python_10) # curious about set membership - self.assertEqual(len(set([isbn.ISBN(milosz_10), isbn.ISBN(milosz_13)])), 2) 
self.assertEqual(len(set([str(isbn.ISBN(milosz_10)), str(isbn.ISBN(milosz_13))])), 2) self.assertEqual( len(set([isbn.ISBN(milosz_10).to_string(), isbn.ISBN(milosz_13).to_string()])), 1 ) -class EncryptedKeyTest(TestCase): - def test_create_read_key(self): - name = "the great answer" - value = "42" - key = Key.objects.create(name=name, value=value) - key.save() - # do we get back the value? - self.assertEqual(Key.objects.filter(name=name)[0].value, value) - # just checking that the encrypted value is not the same as the value - self.assertNotEqual(key.encrypted_value, value) # is this always true? - class SafeGetWorkTest(TestCase): def test_good_work(self): w1 = models.Work() @@ -934,7 +891,7 @@ def test_good_work(self): self.assertEqual(work, w1) work = safe_get_work(w2_id) self.assertEqual(work, w1) - self.assertRaises(Http404, safe_get_work, 3) + self.assertRaises(Work.DoesNotExist, safe_get_work, 3) class WorkTests(TestCase): def setUp(self): @@ -988,23 +945,14 @@ def test_download_page(self): eb1.edition = e1 eb1.format = 'epub' - eb2 = models.Ebook() - eb2.url = "https://example2.com" - eb2.edition = e2 - eb2.format = 'mobi' eb1.save() - eb2.save() anon_client = Client() response = anon_client.get("/work/%s/download/" % w.id, follow=True) - self.assertContains(response, "/download_ebook/%s/"% eb1.id, count=11) - self.assertContains(response, "/download_ebook/%s/"% eb2.id, count=4) + self.assertContains(response, "/download_ebook/%s/"% eb1.id, count=12) self.assertTrue(eb1.edition.work.is_free) eb1.delete() - self.assertTrue(eb2.edition.work.is_free) - eb2.delete() - self.assertFalse(eb2.edition.work.is_free) class MailingListTests(TestCase): #mostly to check that MailChimp account is setp correctly @@ -1018,12 +966,30 @@ def test_mailchimp(self): self.user = User.objects.create_user('chimp_test', 'eric@gluejar.com', 'chimp_test') self.assertTrue(self.user.profile.on_ml) +class CoverTests(TestCase): + test_image = 'https://unglue.it/static/images/logo.png' + 
test_bad_image = 'https://example.com/static/images/logo.png' + def setUp(self): + self.work = Work.objects.create(title="Cover Work") + self.edition = Edition.objects.create(title=self.work.title, work=self.work) + covers.sorl_get_thumbnail(self.test_image, 'x550', crop='noop', quality=95) + + def test_cached_cover(self): + thumb = covers.get_thumbnail(self.test_image, 'x550', crop='noop', quality=95) + self.assertTrue(thumb.exists()) + self.assertTrue(thumb.width, 550) + + def test_bad_cover(self): + thumb = covers.get_thumbnail(self.test_bad_image, '128', crop='noop', quality=95) + self.assertEqual(thumb.url, covers.DEFAULT_COVER) + + @override_settings(LOCAL_TEST=True) class EbookFileTests(TestCase): fixtures = ['initial_data.json'] def test_badepub_errors(self): textfile = NamedTemporaryFile(delete=False) - textfile.write("bad text file") + textfile.write(b"bad text file") textfile.seek(0) self.assertTrue(test_epub(textfile)) @@ -1039,9 +1005,9 @@ def test_ebookfile(self): c = Campaign.objects.create( work=w, type=parameters.BUY2UNGLUE, - cc_date_initial=datetime(2020, 1, 1), + cc_date_initial=datetime(2030, 1, 1), target=1000, - deadline=datetime(2020, 1, 1), + deadline=datetime(2030, 1, 1), license='CC BY', description="dummy description", ) @@ -1054,7 +1020,7 @@ def test_ebookfile(self): try: # now we can try putting the test epub file into Django storage - temp_file = open(temp.name) + temp_file = open(temp.name, 'rb') dj_file = DjangoFile(temp_file) ebf = EbookFile(format='epub', edition=e, file=dj_file) @@ -1069,7 +1035,7 @@ def test_ebookfile(self): self.assertEqual(len(test_epub.opf), 4) self.assertTrue(len(test_epub.opf[2]) < 30) - acq = Acq.objects.create(user=u,work=w,license=TESTING) + acq = Acq.objects.create(user=u, work=w, license=TESTING) self.assertIsNot(acq.nonce, None) url = acq.get_watermarked().download_link_epub @@ -1084,7 +1050,7 @@ def test_ebookfile(self): #flip the campaign to success c.cc_date_initial = datetime(2012, 1, 1) 
c.update_status() - self.assertEqual(c.work.ebooks().count(), 2) + self.assertEqual(c.work.ebooks().count(), 1) c.do_watermark = False c.save() url = acq.get_watermarked().download_link_epub @@ -1108,7 +1074,7 @@ def test_ebookfile_thanks(self): temp.close() try: # now we can try putting the test pdf file into Django storage - temp_file = open(temp.name) + temp_file = open(temp.name, 'rb') dj_file = DjangoFile(temp_file) ebf = EbookFile(format='pdf', edition=e, file=dj_file) @@ -1139,7 +1105,7 @@ def test_ebookfile_thanks(self): temp.close() try: # now we can try putting the test pdf file into Django storage - temp_file = open(temp.name) + temp_file = open(temp.name, 'rb') dj_file = DjangoFile(temp_file) ebf = EbookFile(format='epub', edition=e, file=dj_file) @@ -1149,15 +1115,12 @@ def test_ebookfile_thanks(self): ebf.ebook = eb ebf.save() temp_file.close() - ebf.make_mobi() finally: # make sure we get rid of temp file os.remove(temp.name) #test the ask-appender c.add_ask_to_ebfs() self.assertTrue(c.work.ebookfiles().filter(asking=True, format='epub').count() > 0) - if settings.MOBIGEN_URL: - self.assertTrue(c.work.ebookfiles().filter(asking=True, format='mobi').count() > 0) self.assertTrue(c.work.ebookfiles().filter(asking=True, ebook__active=True).count() > 0) self.assertTrue(c.work.ebookfiles().filter(asking=False, ebook__active=True).count() == 0) #test the unasker @@ -1170,35 +1133,22 @@ def test_bad_ebookfile(self): e = Edition.objects.create(title=w.title, work=w) temp = NamedTemporaryFile(delete=False) - test_file_content = "bad text file" + test_file_content = b"bad text file" temp.write(test_file_content) temp.close() try: # put the bad file into Django storage - temp_file = open(temp.name) + temp_file = open(temp.name, 'rb') dj_file = DjangoFile(temp_file) ebf = EbookFile(format='epub', edition=e, file=dj_file) ebf.save() temp_file.close() - ebf.make_mobi() finally: # make sure we get rid of temp file os.remove(temp.name) - self.assertTrue(ebf.mobied < 
0) -class MobigenTests(TestCase): - def test_convert_to_mobi(self): - """ - check the size of the mobi output of a Moby Dick epub - """ - from regluit.core.mobigen import convert_to_mobi - if settings.TEST_INTEGRATION: - output = convert_to_mobi( - "https://github.com/GITenberg/Moby-Dick--Or-The-Whale_2701/releases/download/0.2.0/Moby-Dick-Or-The-Whale.epub" - ) - self.assertTrue(len(output) > 2207877) @override_settings(LOCAL_TEST=True) class LibTests(TestCase): @@ -1211,6 +1161,7 @@ def test_purchase(self): e = Edition.objects.create(title=w.title, work=w) u = User.objects.create_user('test', 'test@example.org', 'testpass') lu = User.objects.create_user('library', 'testu@example.org', 'testpass') + Library = apps.get_model('libraryauth', 'Library') lib = Library.objects.create(user=lu, owner=u) c = Campaign.objects.create( work=w, @@ -1251,11 +1202,11 @@ def test_ebooks_in_github_release(self): ) expected_set = set([ ('epub', u'Adventures-of-Huckleberry-Finn.epub'), - ('mobi', u'Adventures-of-Huckleberry-Finn.mobi'), ('pdf', u'Adventures-of-Huckleberry-Finn.pdf') ]) - self.assertEqual(set(ebooks), expected_set) + self.assertTrue(('epub', 'Adventures-of-Huckleberry-Finn.epub') in set(ebooks)) + self.assertTrue(('pdf', 'Adventures-of-Huckleberry-Finn.pdf') in set(ebooks)) class OnixLoaderTests(TestCase): fixtures = ['initial_data.json'] diff --git a/core/validation.py b/core/validation.py index aed5a3fdd..26b312c5f 100644 --- a/core/validation.py +++ b/core/validation.py @@ -4,16 +4,19 @@ ''' import re import datetime +import logging from dateutil.parser import parse -from PyPDF2 import PdfFileReader +from pypdf import PdfReader from django.forms import ValidationError from django.utils.translation import ugettext_lazy as _ -from regluit.pyepub import EPUB -from regluit.mobi import Mobi +from pyepub import EPUB from .isbn import ISBN +from regluit.utils.text import remove_author_junk + +logger = logging.getLogger(__name__) ID_VALIDATION = { 'http': 
(re.compile(r"(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$", @@ -21,8 +24,8 @@ "The Web Address must be a valid http(s) URL."), 'isbn': (u'^([\\dxX \\-–—‐,;]+|delete)$', #includes unicode hyphen, endash and emdash "The ISBN must be a valid ISBN-13."), - 'doab': (r'^(\d{1,6}|delete)$', - "The value must be 1-6 digits."), + 'doab': (r'^20.500.12854/(\d{5,8}|delete)$', + "The value must be a handle, starting with 20.500.12854/, followed by 5-8 digits."), 'gtbg': (r'^(\d{1,6}|delete)$', "The Gutenberg number must be 1-6 digits."), 'doi': (r'^(https?://dx\.doi\.org/|https?://doi\.org/)?(10\.\d+/\S+|delete)$', @@ -41,6 +44,8 @@ "The Unglue.it ID must be 1-6 digits."), 'ltwk': (r'^(\d{1,8}|delete)$', "The LibraryThing work ID must be 1-8 digits."), + 'oapn': (r'^(\d{1,8}|delete)$', + "The OAPEN ID must be 1-8 digits."), } def isbn_cleaner(value): @@ -76,10 +81,10 @@ def doi_cleaner(value): } def identifier_cleaner(id_type, quiet=False): - if ID_VALIDATION.has_key(id_type): + if id_type in ID_VALIDATION: (regex, err_msg) = ID_VALIDATION[id_type] extra = ID_MORE_VALIDATION.get(id_type, None) - if isinstance(regex, (str, unicode)): + if isinstance(regex, str): regex = re.compile(regex) def cleaner(value): if not value: @@ -105,17 +110,13 @@ def test_file(the_file, fformat): try: book = EPUB(the_file.file) except Exception as e: + logger.exception(e) raise ValidationError(_('Are you sure this is an EPUB file?: %s' % e)) - elif fformat == 'mobi': - try: - book = Mobi(the_file.file) - book.parse() - except Exception as e: - raise ValidationError(_('Are you sure this is a MOBI file?: %s' % e)) elif fformat == 'pdf': try: - PdfFileReader(the_file.file) - except Exception, e: + PdfReader(the_file.file) + except Exception as e: + logger.exception(e) raise ValidationError(_('%s is not a valid PDF file' % the_file.name)) return True @@ -146,6 +147,25 @@ def valid_subject(subject_name): return False return True +def explode_bic(subject_name): + subjects = [] + if 
subject_name.startswith('bic Book Industry Communication::'): + for sub in subject_name.split('::')[1:]: + try: + subjects.append(sub.strip().split(maxsplit=1)[1]) + except IndexError: + continue + else: + subjects = [subject_name] + return subjects + +def explode_bics(subjects): + exploded = [] + for s in subjects: + exploded.extend(explode_bic(s)) + return exploded + + reverse_name_comma = re.compile(r',(?! *Jr[\., ])') def unreverse_name(name): @@ -188,6 +208,7 @@ def auth_cleaner(auth): auth = _and_.sub(',', auth) authlist = comma_list_delim.split(auth) for auth in authlist: + auth = remove_author_junk(auth) cleaned.append(spaces.sub(' ', auth.strip())) return cleaned diff --git a/core/views.py b/core/views.py index 3f44d7bca..00f0f60c4 100755 --- a/core/views.py +++ b/core/views.py @@ -12,12 +12,12 @@ def test_read(request): row_id = 1 - print "Attempting to read row" + print("Attempting to read row") # A read the waits for the exclusive lock for the row campaign = Campaign.objects.raw("SELECT * FROM core_campaign WHERE id=%d FOR UPDATE" % row_id)[0] - print "Successfully read row data %d" % campaign.target + print("Successfully read row data %d" % campaign.target) except: traceback.print_exc() @@ -31,7 +31,7 @@ def test_write(request): row_id = 1 campaign = Campaign.objects.get(id=row_id) - print "Attempting to write row via ordinary ORM" + print("Attempting to write row via ordinary ORM") # # Modify the data. 
This will block if any shared lock (Either FOR UPDATE or LOCK IN SHARED MODE is held @@ -39,7 +39,7 @@ def test_write(request): campaign.target = campaign.target + 10 campaign.save() - print "Successfully write new row data %d" % campaign.target + print("Successfully write new row data %d" % campaign.target) except: traceback.print_exc() @@ -52,11 +52,11 @@ def test_lock(request): try: row_id = 1 - print "Attempting to acquire row lock" + print("Attempting to acquire row lock") campaign = Campaign.objects.raw("SELECT * FROM core_campaign WHERE id=%d FOR UPDATE" % row_id)[0] - print "Row lock acquired, modifying data" + print("Row lock acquired, modifying data") # Modify the data campaign.target = campaign.target + 10 @@ -72,9 +72,9 @@ def test_lock(request): # As soon as the function completes, the transaction will be committed and the lock released. # You can modify the commit_on_success decorator to get different transaction behaviors # - print "Thread sleeping for 10 seconds" + print("Thread sleeping for 10 seconds") time.sleep(10) - print "Thread sleep complete" + print("Thread sleep complete") except: traceback.print_exc() diff --git a/deploy/celerybeat b/deploy/celerybeat deleted file mode 100644 index 34b9ad6a4..000000000 --- a/deploy/celerybeat +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -# ========================================================= -# celerybeat - Starts the Celery periodic task scheduler. 
-# ========================================================= -# -# :Usage: /etc/init.d/celerybeat {start|stop|force-reload|restart|try-restart|status} -# :Configuration file: /etc/default/celerybeat or /etc/default/celeryd -# -# See http://docs.celeryq.org/en/latest/cookbook/daemonizing.html#init-script-celerybeat -# This file is copied from https://github.com/ask/celery/blob/2.4/contrib/generic-init.d/celerybeat - -### BEGIN INIT INFO -# Provides: celerybeat -# Required-Start: $network $local_fs $remote_fs -# Required-Stop: $network $local_fs $remote_fs -# Default-Start: 2 3 4 5 -# Default-Stop: 0 1 6 -# Short-Description: celery periodic task scheduler -### END INIT INFO - -# Cannot use set -e/bash -e since the kill -0 command will abort -# abnormally in the absence of a valid process ID. -#set -e - -DEFAULT_PID_FILE="/var/run/celerybeat.pid" -DEFAULT_LOG_FILE="/var/log/celerybeat.log" -DEFAULT_LOG_LEVEL="INFO" -DEFAULT_CELERYBEAT="celerybeat" - -# /etc/init.d/ssh: start and stop the celery task worker daemon. - -if test -f /etc/default/celeryd; then - . /etc/default/celeryd -fi - -if test -f /etc/default/celerybeat; then - . /etc/default/celerybeat -fi - -CELERYBEAT=${CELERYBEAT:-$DEFAULT_CELERYBEAT} -CELERYBEAT_PID_FILE=${CELERYBEAT_PID_FILE:-${CELERYBEAT_PIDFILE:-$DEFAULT_PID_FILE}} -CELERYBEAT_LOG_FILE=${CELERYBEAT_LOG_FILE:-${CELERYBEAT_LOGFILE:-$DEFAULT_LOG_FILE}} -CELERYBEAT_LOG_LEVEL=${CELERYBEAT_LOG_LEVEL:-${CELERYBEAT_LOGLEVEL:-$DEFAULT_LOG_LEVEL}} - -export CELERY_LOADER - -CELERYBEAT_OPTS="$CELERYBEAT_OPTS -f $CELERYBEAT_LOG_FILE -l $CELERYBEAT_LOG_LEVEL" - -if [ -n "$2" ]; then - CELERYBEAT_OPTS="$CELERYBEAT_OPTS $2" -fi - -CELERYBEAT_LOG_DIR=`dirname $CELERYBEAT_LOG_FILE` -CELERYBEAT_PID_DIR=`dirname $CELERYBEAT_PID_FILE` -if [ ! -d "$CELERYBEAT_LOG_DIR" ]; then - mkdir -p $CELERYBEAT_LOG_DIR -fi -if [ ! -d "$CELERYBEAT_PID_DIR" ]; then - mkdir -p $CELERYBEAT_PID_DIR -fi - -# Extra start-stop-daemon options, like user/group. 
-if [ -n "$CELERYBEAT_USER" ]; then - DAEMON_OPTS="$DAEMON_OPTS --uid $CELERYBEAT_USER" - chown "$CELERYBEAT_USER" $CELERYBEAT_LOG_DIR $CELERYBEAT_PID_DIR -fi -if [ -n "$CELERYBEAT_GROUP" ]; then - DAEMON_OPTS="$DAEMON_OPTS --gid $CELERYBEAT_GROUP" - chgrp "$CELERYBEAT_GROUP" $CELERYBEAT_LOG_DIR $CELERYBEAT_PID_DIR -fi - -CELERYBEAT_CHDIR=${CELERYBEAT_CHDIR:-$CELERYD_CHDIR} -if [ -n "$CELERYBEAT_CHDIR" ]; then - DAEMON_OPTS="$DAEMON_OPTS --workdir $CELERYBEAT_CHDIR" -fi - - -export PATH="${PATH:+$PATH:}/usr/sbin:/sbin" - -check_dev_null() { - if [ ! -c /dev/null ]; then - echo "/dev/null is not a character device!" - exit 1 - fi -} - -wait_pid () { - pid=$1 - forever=1 - i=0 - while [ $forever -gt 0 ]; do - kill -0 $pid 1>/dev/null 2>&1 - if [ $? -eq 1 ]; then - echo "OK" - forever=0 - else - kill -TERM "$pid" - i=$((i + 1)) - if [ $i -gt 60 ]; then - echo "ERROR" - echo "Timed out while stopping (30s)" - forever=0 - else - sleep 0.5 - fi - fi - done -} - - -stop_beat () { - echo -n "Stopping celerybeat... " - if [ -f "$CELERYBEAT_PID_FILE" ]; then - wait_pid $(cat "$CELERYBEAT_PID_FILE") - else - echo "NOT RUNNING" - fi -} - -start_beat () { - echo "Starting celerybeat..." 
- if [ -n "$VIRTUALENV" ]; then - source $VIRTUALENV/bin/activate - fi - $CELERYBEAT $CELERYBEAT_OPTS $DAEMON_OPTS --detach \ - --pidfile="$CELERYBEAT_PID_FILE" -} - - - -case "$1" in - start) - check_dev_null - start_beat - ;; - stop) - stop_beat - ;; - reload|force-reload) - echo "Use start+stop" - ;; - restart) - echo "Restarting celery periodic task scheduler" - stop_beat - check_dev_null - start_beat - ;; - - *) - echo "Usage: /etc/init.d/celerybeat {start|stop|restart}" - exit 1 -esac - -exit 0 \ No newline at end of file diff --git a/deploy/celerybeat_just.conf b/deploy/celerybeat_just.conf deleted file mode 100644 index 2e82b9b7d..000000000 --- a/deploy/celerybeat_just.conf +++ /dev/null @@ -1,36 +0,0 @@ -# http://docs.celeryproject.org/en/latest/cookbook/daemonizing.html#generic-initd-celerybeat-example -# to be placed at /etc/defaults/celerybeat - -# Where to chdir at start. -CELERYBEAT_CHDIR="/opt/regluit/" - -# Extra arguments to celerybeat -#CELERYBEAT_OPTS="--schedule=/var/run/celerybeat-schedule" - -# Name of the celery config module.# -CELERY_CONFIG_MODULE="celeryconfig" - -# Name of the projects settings module. -export DJANGO_SETTINGS_MODULE="regluit.settings.just" - -# Path to celerybeat -CELERYBEAT="/opt/regluit/ENV/bin/django-admin.py celerybeat" - -# virtualenv to use -VIRTUALENV="/opt/regluit/ENV" - -#Full path to the PID file. Default is /var/run/celeryd.pid -CELERYBEAT_PIDFILE="/var/log/celerybeat/celerybeat.pid" - -#Full path to the celeryd log file. Default is /var/log/celeryd.log -CELERYBEAT_LOGFILE="/var/log/celerybeat/celerybeat.log" - -#Log level to use for celeryd. Default is INFO. -CELERYBEAT_LOG_LEVEL="INFO" - - -#User to run celeryd as. Default is current user. -#CELERYBEAT_USER - -#Group to run celeryd as. Default is current user. 
-#CELERYBEAT_GROUP diff --git a/deploy/celerybeat_localvm.conf b/deploy/celerybeat_localvm.conf deleted file mode 100644 index 4b61ec86c..000000000 --- a/deploy/celerybeat_localvm.conf +++ /dev/null @@ -1,36 +0,0 @@ -# http://docs.celeryproject.org/en/latest/cookbook/daemonizing.html#generic-initd-celerybeat-example -# to be placed at /etc/defaults/celerybeat - -# Where to chdir at start. -CELERYBEAT_CHDIR="/opt/regluit/" - -# Extra arguments to celerybeat -#CELERYBEAT_OPTS="--schedule=/var/run/celerybeat-schedule" - -# Name of the celery config module.# -CELERY_CONFIG_MODULE="celeryconfig" - -# Name of the projects settings module. -export DJANGO_SETTINGS_MODULE="regluit.settings.localvm" - -# Path to celerybeat -CELERYBEAT="/opt/regluit/ENV/bin/django-admin.py celerybeat" - -# virtualenv to use -VIRTUALENV="/opt/regluit/ENV" - -#Full path to the PID file. Default is /var/run/celeryd.pid -CELERYBEAT_PIDFILE="/var/log/celerybeat/celerybeat.pid" - -#Full path to the celeryd log file. Default is /var/log/celeryd.log -CELERYBEAT_LOGFILE="/var/log/celerybeat/celerybeat.log" - -#Log level to use for celeryd. Default is INFO. -CELERYBEAT_LOG_LEVEL="INFO" - - -#User to run celeryd as. Default is current user. -#CELERYBEAT_USER - -#Group to run celeryd as. Default is current user. -#CELERYBEAT_GROUP diff --git a/deploy/celerybeat_please.conf b/deploy/celerybeat_please.conf deleted file mode 100644 index af4b9a92e..000000000 --- a/deploy/celerybeat_please.conf +++ /dev/null @@ -1,36 +0,0 @@ -# http://docs.celeryproject.org/en/latest/cookbook/daemonizing.html#generic-initd-celerybeat-example -# to be placed at /etc/defaults/celerybeat - -# Where to chdir at start. -CELERYBEAT_CHDIR="/opt/regluit/" - -# Extra arguments to celerybeat -#CELERYBEAT_OPTS="--schedule=/var/run/celerybeat-schedule" - -# Name of the celery config module.# -CELERY_CONFIG_MODULE="celeryconfig" - -# Name of the projects settings module. 
-export DJANGO_SETTINGS_MODULE="regluit.settings.please" - -# Path to celerybeat -CELERYBEAT="/opt/regluit/ENV/bin/django-admin.py celerybeat" - -# virtualenv to use -VIRTUALENV="/opt/regluit/ENV" - -#Full path to the PID file. Default is /var/run/celeryd.pid -CELERYBEAT_PIDFILE="/var/log/celerybeat/celerybeat.pid" - -#Full path to the celeryd log file. Default is /var/log/celeryd.log -CELERYBEAT_LOGFILE="/var/log/celerybeat/celerybeat.log" - -#Log level to use for celeryd. Default is INFO. -CELERYBEAT_LOG_LEVEL="INFO" - - -#User to run celeryd as. Default is current user. -#CELERYBEAT_USER - -#Group to run celeryd as. Default is current user. -#CELERYBEAT_GROUP diff --git a/deploy/celerybeat_prod.conf b/deploy/celerybeat_prod.conf deleted file mode 100644 index aa0e7f941..000000000 --- a/deploy/celerybeat_prod.conf +++ /dev/null @@ -1,35 +0,0 @@ -# http://docs.celeryproject.org/en/latest/cookbook/daemonizing.html#generic-initd-celerybeat-example -# to be placed at /etc/defaults/celerybeat - -# Where to chdir at start. -CELERYBEAT_CHDIR="/opt/regluit/" - -# Extra arguments to celerybeat -#CELERYBEAT_OPTS="--schedule=/var/run/celerybeat-schedule" - -# Name of the celery config module.# -CELERY_CONFIG_MODULE="celeryconfig" - -# Name of the projects settings module. -export DJANGO_SETTINGS_MODULE="regluit.settings.prod" - -# Path to celerybeat -CELERYBEAT="/opt/regluit/ENV/bin/django-admin.py celerybeat" - -# virtualenv to use -VIRTUALENV="/opt/regluit/ENV" - -#Full path to the PID file. Default is /var/run/celeryd.pid -CELERYBEAT_PIDFILE="/var/log/celerybeat/celerybeat.pid" - -#Full path to the celeryd log file. Default is /var/log/celeryd.log -CELERYBEAT_LOGFILE="/var/log/celerybeat/celerybeat.log" - -#Log level to use for celeryd. Default is INFO. -CELERYBEAT_LOG_LEVEL="INFO" - -#User to run celeryd as. Default is current user. -#CELERYBEAT_USER - -#Group to run celeryd as. Default is current user. 
-#CELERYBEAT_GROUP diff --git a/deploy/celerybeat_rydev.conf b/deploy/celerybeat_rydev.conf deleted file mode 100644 index fbde6c92d..000000000 --- a/deploy/celerybeat_rydev.conf +++ /dev/null @@ -1,36 +0,0 @@ -# http://docs.celeryproject.org/en/latest/cookbook/daemonizing.html#generic-initd-celerybeat-example -# to be placed at /etc/defaults/celerybeat - -# Where to chdir at start. -CELERYBEAT_CHDIR="/home/ubuntu/regluit/" - -# Extra arguments to celerybeat -#CELERYBEAT_OPTS="--schedule=/var/run/celerybeat-schedule" - -# Name of the celery config module.# -CELERY_CONFIG_MODULE="celeryconfig" - -# Name of the projects settings module. -export DJANGO_SETTINGS_MODULE="regluit.settings.me" - -# Path to celerybeat -CELERYBEAT="/home/ubuntu/.virtualenvs/regluit/bin/django-admin.py celerybeat" - -# virtualenv to use -VIRTUALENV="/home/ubuntu/.virtualenvs/regluit" - -#Full path to the PID file. Default is /var/run/celeryd.pid -CELERYBEAT_PIDFILE="/var/log/celerybeat/celerybeat.pid" - -#Full path to the celeryd log file. Default is /var/log/celeryd.log -CELERYBEAT_LOGFILE="/var/log/celerybeat/celerybeat.log" - -#Log level to use for celeryd. Default is INFO. -CELERYBEAT_LOG_LEVEL="INFO" - - -#User to run celeryd as. Default is current user. -#CELERYBEAT_USER - -#Group to run celeryd as. Default is current user. -#CELERYBEAT_GROUP diff --git a/deploy/celeryd b/deploy/celeryd deleted file mode 100644 index 12ff8445e..000000000 --- a/deploy/celeryd +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash -# ============================================ -# celeryd - Starts the Celery worker daemon. -# ============================================ -# -# :Usage: /etc/init.d/celeryd {start|stop|force-reload|restart|try-restart|status} -# -# :Configuration file: /etc/default/celeryd -# -# To configure celeryd you probably need to tell it where to chdir. 
-# -# EXAMPLE CONFIGURATION -# ===================== -# -# this is an example configuration for a Python project: -# -# /etc/default/celeryd: -# -# # List of nodes to start -# CELERYD_NODES="worker1 worker2 worker3"k -# # ... can also be a number of workers -# CELERYD_NODES=3 -# -# # Where to chdir at start. -# CELERYD_CHDIR="/opt/Myproject/" -# -# # Extra arguments to celeryd -# CELERYD_OPTS="--time-limit=300" -# -# # Name of the celery config module.# -# CELERY_CONFIG_MODULE="celeryconfig" -# -# EXAMPLE DJANGO CONFIGURATION -# ============================ -# -# # Where the Django project is. -# CELERYD_CHDIR="/opt/Project/" -# -# # Name of the projects settings module. -# export DJANGO_SETTINGS_MODULE="settings" -# -# # Path to celeryd -# CELERYD="/opt/Project/manage.py celeryd" -# -# AVAILABLE OPTIONS -# ================= -# -# * CELERYD_NODES -# -# A space separated list of nodes, or a number describing the number of -# nodes, to start -# -# * CELERYD_OPTS -# Additional arguments to celeryd-multi, see `celeryd-multi --help` -# and `celeryd --help` for help. -# -# * CELERYD_CHDIR -# Path to chdir at start. Default is to stay in the current directory. -# -# * CELERYD_PIDFILE -# Full path to the pidfile. Default is /var/run/celeryd.pid. -# -# * CELERYD_LOGFILE -# Full path to the celeryd logfile. Default is /var/log/celeryd.log -# -# * CELERYD_LOG_LEVEL -# Log level to use for celeryd. Default is INFO. -# -# * CELERYD -# Path to the celeryd program. Default is `celeryd`. -# You can point this to an virtualenv, or even use manage.py for django. -# -# * CELERYD_USER -# User to run celeryd as. Default is current user. -# -# * CELERYD_GROUP -# Group to run celeryd as. Default is current user. 
- -# VARIABLE EXPANSION -# ================== -# -# The following abbreviations will be expanded -# -# * %n -> node name -# * %h -> host name - - -### BEGIN INIT INFO -# Provides: celeryd -# Required-Start: $network $local_fs $remote_fs -# Required-Stop: $network $local_fs $remote_fs -# Default-Start: 2 3 4 5 -# Default-Stop: 0 1 6 -# Short-Description: celery task worker daemon -### END INIT INFO - -#set -e - -DEFAULT_PID_FILE="/var/run/celeryd@%n.pid" -DEFAULT_LOG_FILE="/var/log/celeryd@%n.log" -DEFAULT_LOG_LEVEL="INFO" -DEFAULT_NODES="celery" -DEFAULT_CELERYD="-m celery.bin.celeryd_detach" - -# /etc/init.d/celeryd: start and stop the celery task worker daemon. - -CELERY_DEFAULTS=${CELERY_DEFAULTS:-"/etc/default/celeryd"} - -test -f "$CELERY_DEFAULTS" && . "$CELERY_DEFAULTS" -if [ -f "/etc/default/celeryd" ]; then - . /etc/default/celeryd -fi - -if [ -f $VIRTUALENV_ACTIVATE ]; then - echo "activating virtualenv $VIRTUALENV_ACTIVATE" - source "$VIRTUALENV_ACTIVATE" -fi - -CELERYD_PID_FILE=${CELERYD_PID_FILE:-${CELERYD_PIDFILE:-$DEFAULT_PID_FILE}} -CELERYD_LOG_FILE=${CELERYD_LOG_FILE:-${CELERYD_LOGFILE:-$DEFAULT_LOG_FILE}} -CELERYD_LOG_LEVEL=${CELERYD_LOG_LEVEL:-${CELERYD_LOGLEVEL:-$DEFAULT_LOG_LEVEL}} -CELERYD_MULTI=${CELERYD_MULTI:-"celeryd-multi"} -CELERYD=${CELERYD:-$DEFAULT_CELERYD} -CELERYD_NODES=${CELERYD_NODES:-$DEFAULT_NODES} - -export CELERY_LOADER - -if [ -n "$2" ]; then - CELERYD_OPTS="$CELERYD_OPTS $2" -fi - -# Extra start-stop-daemon options, like user/group. -if [ -n "$CELERYD_USER" ]; then - DAEMON_OPTS="$DAEMON_OPTS --uid=$CELERYD_USER" -fi -if [ -n "$CELERYD_GROUP" ]; then - DAEMON_OPTS="$DAEMON_OPTS --gid=$CELERYD_GROUP" -fi - -if [ -n "$CELERYD_CHDIR" ]; then - DAEMON_OPTS="$DAEMON_OPTS --workdir=\"$CELERYD_CHDIR\"" -fi - - -check_dev_null() { - if [ ! -c /dev/null ]; then - echo "/dev/null is not a character device!" 
- exit 1 - fi -} - - -export PATH="${PATH:+$PATH:}/usr/sbin:/sbin" - - -stop_workers () { - $CELERYD_MULTI stop $CELERYD_NODES --pidfile="$CELERYD_PID_FILE" -} - - -start_workers () { - $CELERYD_MULTI start $CELERYD_NODES $DAEMON_OPTS \ - --pidfile="$CELERYD_PID_FILE" \ - --logfile="$CELERYD_LOG_FILE" \ - --loglevel="$CELERYD_LOG_LEVEL" \ - --cmd="$CELERYD" \ - $CELERYD_OPTS -} - - -restart_workers () { - $CELERYD_MULTI restart $CELERYD_NODES $DAEMON_OPTS \ - --pidfile="$CELERYD_PID_FILE" \ - --logfile="$CELERYD_LOG_FILE" \ - --loglevel="$CELERYD_LOG_LEVEL" \ - --cmd="$CELERYD" \ - $CELERYD_OPTS -} - - - -case "$1" in - start) - check_dev_null - start_workers - ;; - - stop) - check_dev_null - stop_workers - ;; - - reload|force-reload) - echo "Use restart" - ;; - - status) - celeryctl status - ;; - - restart) - check_dev_null - restart_workers - ;; - - try-restart) - check_dev_null - restart_workers - ;; - - *) - echo "Usage: /etc/init.d/celeryd {start|stop|restart|try-restart|kill}" - exit 1 - ;; -esac - -exit 0 diff --git a/deploy/celeryd_just.conf b/deploy/celeryd_just.conf deleted file mode 100644 index 3e1ee05e2..000000000 --- a/deploy/celeryd_just.conf +++ /dev/null @@ -1,11 +0,0 @@ -CELERYD_NODES="w1" -CELERYD_CHDIR="/opt/regluit/" -CELERYD_LOG_FILE="/var/log/celery/%n.log" -CELERYD_PID_FILE="/var/log/celery/%n.pid" -CELERYD_USER="celery" -CELERYD_GROUP="celery" -CELERYD="/opt/regluit/ENV/bin/django-admin.py celeryd" -CELERYD_MULTI="/opt/regluit/ENV/bin/django-admin.py celeryd_multi" - -VIRTUALENV_ACTIVATE="/opt/regluit/ENV/bin/activate" -export DJANGO_SETTINGS_MODULE="regluit.settings.just" diff --git a/deploy/celeryd_localvm.conf b/deploy/celeryd_localvm.conf deleted file mode 100644 index b0ba83243..000000000 --- a/deploy/celeryd_localvm.conf +++ /dev/null @@ -1,11 +0,0 @@ -CELERYD_NODES="w1" -CELERYD_CHDIR="/opt/regluit/" -CELERYD_LOG_FILE="/var/log/celery/%n.log" -CELERYD_PID_FILE="/var/log/celery/%n.pid" -CELERYD_USER="celery" -CELERYD_GROUP="celery" 
-CELERYD="/opt/regluit/ENV/bin/django-admin.py celeryd" -CELERYD_MULTI="/opt/regluit/ENV/bin/django-admin.py celeryd_multi" - -VIRTUALENV_ACTIVATE="/opt/regluit/ENV/bin/activate" -export DJANGO_SETTINGS_MODULE="regluit.settings.localvm" diff --git a/deploy/celeryd_please.conf b/deploy/celeryd_please.conf deleted file mode 100644 index 8c41c95ac..000000000 --- a/deploy/celeryd_please.conf +++ /dev/null @@ -1,11 +0,0 @@ -CELERYD_NODES="w1" -CELERYD_CHDIR="/opt/regluit/" -CELERYD_LOG_FILE="/var/log/celery/%n.log" -CELERYD_PID_FILE="/var/log/celery/%n.pid" -CELERYD_USER="celery" -CELERYD_GROUP="celery" -CELERYD="/opt/regluit/ENV/bin/django-admin.py celeryd" -CELERYD_MULTI="/opt/regluit/ENV/bin/django-admin.py celeryd_multi" - -VIRTUALENV_ACTIVATE="/opt/regluit/ENV/bin/activate" -export DJANGO_SETTINGS_MODULE="regluit.settings.please" diff --git a/deploy/celeryd_prod.conf b/deploy/celeryd_prod.conf deleted file mode 100644 index 476e607fc..000000000 --- a/deploy/celeryd_prod.conf +++ /dev/null @@ -1,11 +0,0 @@ -CELERYD_NODES="w1" -CELERYD_CHDIR="/opt/regluit/" -CELERYD_LOG_FILE="/var/log/celery/%n.log" -CELERYD_PID_FILE="/var/log/celery/%n.pid" -CELERYD_USER="celery" -CELERYD_GROUP="celery" -CELERYD="/opt/regluit/ENV/bin/django-admin.py celeryd" -CELERYD_MULTI="/opt/regluit/ENV/bin/django-admin.py celeryd_multi" - -VIRTUALENV_ACTIVATE="/opt/regluit/ENV/bin/activate" -export DJANGO_SETTINGS_MODULE="regluit.settings.prod" diff --git a/deploy/celeryd_rydev.conf b/deploy/celeryd_rydev.conf deleted file mode 100755 index 9c89e6ece..000000000 --- a/deploy/celeryd_rydev.conf +++ /dev/null @@ -1,11 +0,0 @@ -CELERYD_NODES="w1" -CELERYD_CHDIR="/home/ubuntu/regluit/" -CELERYD_LOG_FILE="/var/log/celery/%n.log" -CELERYD_PID_FILE="/var/log/celery/%n.pid" -CELERYD_USER="celery" -CELERYD_GROUP="celery" -CELERYD="/home/ubuntu/.virtualenvs/regluit/bin/django-admin.py celeryd" -CELERYD_MULTI="/home/ubuntu/.virtualenvs/regluit/bin/django-admin.py celeryd_multi" - 
-VIRTUALENV_ACTIVATE="/home/ubuntu/.virtualenvs/regluit/bin/activate" -export DJANGO_SETTINGS_MODULE="regluit.settings.me" diff --git a/deploy/crontab_just.txt b/deploy/crontab_just.txt deleted file mode 100644 index 4b8386fec..000000000 --- a/deploy/crontab_just.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. -# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). -# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -* * * * * cd /opt/regluit; . /opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py emit_notices --settings=regluit.settings.just > /opt/regluit/deploy/emit_notices.log 2>&1 ; touch /opt/regluit/deploy/last-cron -@reboot sudo mkdir /var/run/celery; sudo chown celery:celery /var/log/celery /var/run/celery; cd /opt/regluit; . /opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py celeryd_multi restart w1 --settings=regluit.settings.just; /etc/init.d/celerybeat restart; diff --git a/deploy/crontab_localvm.txt b/deploy/crontab_localvm.txt deleted file mode 100644 index ef35f2a16..000000000 --- a/deploy/crontab_localvm.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. 
-# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). -# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -* * * * * cd /opt/regluit; . /opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py emit_notices --settings=regluit.settings.localvm > /opt/regluit/deploy/emit_notices.log 2>&1 ; touch /opt/regluit/deploy/last-cron -@reboot sudo mkdir /var/run/celery; sudo chown celery:celery /var/log/celery /var/run/celery; cd /opt/regluit; . /opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py celeryd_multi restart w1 --settings=regluit.settings.localvm; /etc/init.d/celerybeat restart; \ No newline at end of file diff --git a/deploy/crontab_please.txt b/deploy/crontab_please.txt deleted file mode 100644 index bd975eec6..000000000 --- a/deploy/crontab_please.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. 
-# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). -# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -* * * * * cd /opt/regluit; . /opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py emit_notices --settings=regluit.settings.please > /opt/regluit/deploy/emit_notices.log 2>&1 ; touch /opt/regluit/deploy/last-cron -@reboot sudo mkdir /var/run/celery; sudo chown celery:celery /var/log/celery /var/run/celery; cd /opt/regluit; . /opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py celeryd_multi restart w1 --settings=regluit.settings.please; /etc/init.d/celerybeat restart; diff --git a/deploy/crontab_prod.txt b/deploy/crontab_prod.txt deleted file mode 100644 index ca416808d..000000000 --- a/deploy/crontab_prod.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. 
-# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). -# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -* * * * * /opt/regluit/deploy/emit_notices.sh -@reboot sudo mkdir /var/run/celery; sudo chown celery:celery /var/log/celery /var/run/celery; cd /opt/regluit; . 
/opt/regluit/ENV/bin/activate; /opt/regluit/ENV/bin/django-admin.py celeryd_multi restart w1 --settings=regluit.settings.prod; /etc/init.d/celerybeat restart; diff --git a/deploy/localvm.conf b/deploy/localvm.conf deleted file mode 100644 index e862ef813..000000000 --- a/deploy/localvm.conf +++ /dev/null @@ -1,59 +0,0 @@ -WSGIPythonHome /opt/regluit/ENV -WSGISocketPrefix /opt/regluit - -<VirtualHost *:80> - -ServerName localvm -ServerAdmin info@ebookfoundation.org - -Redirect permanent / https://192.168.33.10.xip.io:443/ - -</VirtualHost> - -<VirtualHost _default_:443> - -SSLEngine on -ServerName localvm:443 - -# generated using https://mozilla.github.io/server-side-tls/ssl-config-generator/ -# intermediate mode -# 2015.03.04 (with Apache v 2.2.22 and OpenSSL 1.0.1 and HSTS enabled) - -SSLProtocol all -SSLv2 -SSLv3 -SSLCipherSuite ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-DSS-AES128-GCM-SHA256:kEDH+AESGCM:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA:ECDHE-ECDSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-DSS-AES128-SHA256:DHE-RSA-AES256-SHA256:DHE-DSS-AES256-SHA:DHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:AES:CAMELLIA:DES-CBC3-SHA:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!aECDH:!EDH-DSS-DES-CBC3-SHA:!EDH-RSA-DES-CBC3-SHA:!KRB5-DES-CBC3-SHA -SSLHonorCipherOrder on - -# HSTS (mod_headers is required) (15768000 seconds = 6 months) -Header always add Strict-Transport-Security "max-age=15768000" - -SSLCertificateFile /etc/ssl/certs/server.crt -SSLCertificateKeyFile /etc/ssl/private/server.key - -WSGIDaemonProcess regluit-ssl processes=4 threads=4 python-eggs=/tmp/regluit-python-eggs -WSGIScriptAlias / /opt/regluit/deploy/localvm.wsgi - -<Directory /opt/regluit> - Options 
Indexes FollowSymLinks - AllowOverride None - - Order allow,deny - Allow from all -</Directory> - -<Directory /opt/regluit/static> - Options Indexes FollowSymLinks - AllowOverride None - - Order allow,deny - Allow from all -</Directory> - -Alias /static /var/www/static - -BrowserMatch "MSIE [2-6]" \ - nokeepalive ssl-unclean-shutdown \ - downgrade-1.0 force-response-1.0 -# MSIE 7 and newer should be able to use keepalive -BrowserMatch "MSIE [17-9]" ssl-unclean-shutdown - -</VirtualHost> diff --git a/deploy/setup-just.sh b/deploy/setup-just.sh deleted file mode 100644 index 216f7a8cb..000000000 --- a/deploy/setup-just.sh +++ /dev/null @@ -1,3 +0,0 @@ -cd /opt/regluit -source ENV/bin/activate -export DJANGO_SETTINGS_MODULE=regluit.settings.just diff --git a/deploy/setup-please.sh b/deploy/setup-please.sh deleted file mode 100644 index 0815d3612..000000000 --- a/deploy/setup-please.sh +++ /dev/null @@ -1,3 +0,0 @@ -cd /opt/regluit -source ENV/bin/activate -export DJANGO_SETTINGS_MODULE=regluit.settings.please diff --git a/distro/migrations/0001_initial.py b/distro/migrations/0001_initial.py index 1a4fbca15..44ef8b67f 100644 --- a/distro/migrations/0001_initial.py +++ b/distro/migrations/0001_initial.py @@ -41,6 +41,6 @@ class Migration(migrations.Migration): migrations.AddField( model_name='deposit', name='target', - field=models.ForeignKey(related_name='deposits', to='distro.Target'), + field=models.ForeignKey(on_delete=models.CASCADE, related_name='deposits', to='distro.Target'), ), ] diff --git a/distro/migrations/0002_auto_20200214_1347.py b/distro/migrations/0002_auto_20200214_1347.py new file mode 100644 index 000000000..a93207a12 --- /dev/null +++ b/distro/migrations/0002_auto_20200214_1347.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.28 on 2020-02-14 13:47 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('distro', '0001_initial'), 
+ ] + + operations = [ + migrations.AlterField( + model_name='target', + name='protocol', + field=models.CharField(default='ftp', max_length=10), + ), + ] diff --git a/distro/models.py b/distro/models.py index 5e9adb4b0..bb5df592e 100644 --- a/distro/models.py +++ b/distro/models.py @@ -1,7 +1,7 @@ import logging import requests from ftplib import FTP, FTP_TLS -from StringIO import StringIO +from io import StringIO from django.db import models logger = logging.getLogger(__name__) @@ -14,7 +14,7 @@ class Target(models.Model): protocol = models.CharField(max_length=10, default='ftp') formats = models.ManyToManyField('Format', related_name='targets') - def __unicode__(self): + def __str__(self): return self.name def get_ftp(self): @@ -57,7 +57,7 @@ def push_file(self, filename, file_to_push): class Deposit(models.Model): - target = models.ForeignKey(Target, related_name="deposits") + target = models.ForeignKey(Target, on_delete=models.CASCADE, related_name="deposits") isbn = models.CharField(max_length=13) format = models.CharField(max_length=30) updated = models.DateTimeField(auto_now_add=True) @@ -65,5 +65,5 @@ class Deposit(models.Model): class Format(models.Model): name = models.CharField(max_length=4) - def __unicode__(self): + def __str__(self): return self.name diff --git a/distro/push.py b/distro/push.py index 30c4fb4b1..4a485ff95 100644 --- a/distro/push.py +++ b/distro/push.py @@ -1,11 +1,12 @@ import logging from datetime import datetime -from StringIO import StringIO +from io import StringIO from regluit.core.facets import BaseFacet -from regluit.core.models import Work, good_providers +from regluit.core.models import Work from regluit.api.onix import onix_feed +from regluit.core.parameters import GOOD_PROVIDERS from .models import Target @@ -45,7 +46,7 @@ def __init__(self): editions__ebooks__created__gt = start, identifiers__type="isbn", editions__ebooks__format__in = formats, - editions__ebooks__provider__in = good_providers, + 
editions__ebooks__provider__in = GOOD_PROVIDERS, ).distinct().order_by('-featured') model_filters = {"Ebook": format_filter, "Edition": edition_format_filter} @@ -56,7 +57,10 @@ def __init__(self): return TargetFacet() def push_onix(target, facet_class): - target.push_file('unglueit_onix_{:%Y%m%d%H%M%S}.xml'.format(datetime.now()),StringIO(onix_feed(facet_class))) + target.push_file( + 'unglueit_onix_{:%Y%m%d%H%M%S}.xml'.format(datetime.now()), + StringIO(onix_feed(facet_class)) + ) def push_all(start=datetime(1900,1,1), new=False, max=0): for target in Target.objects.all(): diff --git a/frontend/forms/__init__.py b/frontend/forms/__init__.py index 59b132793..5f7afb8e6 100644 --- a/frontend/forms/__init__.py +++ b/frontend/forms/__init__.py @@ -12,8 +12,7 @@ from django import forms from django.conf import settings from django.contrib.auth.models import User -from django.forms.widgets import RadioSelect -from django.forms.extras.widgets import SelectDateWidget +from django.forms.widgets import RadioSelect, SelectDateWidget from django.utils.translation import ugettext_lazy as _ from selectable.forms import ( @@ -38,8 +37,6 @@ Work, Press, Libpref, - TWITTER, - FACEBOOK, UNGLUEITAR ) from regluit.libraryauth.models import Library @@ -48,6 +45,7 @@ REWARDS, BUY2UNGLUE, THANKS, + DONATION_CHOICES, ) from regluit.core.lookups import ( OwnerLookup, @@ -70,33 +68,9 @@ RightsHolderForm, UserClaimForm ) -from questionnaire.models import Questionnaire logger = logging.getLogger(__name__) -class SurveyForm(forms.Form): - label = forms.CharField(max_length=64, required=True) - survey = forms.ModelChoiceField(Questionnaire.objects.all(), widget=RadioSelect(), empty_label=None, required = True,) - isbn = ISBNField( - label=_("ISBN"), - max_length=17, - required = False, - help_text = _("13 digits, no dash."), - error_messages = { - 'invalid': _("This must be a valid ISBN-13."), - } - ) - - def clean_isbn(self): - isbn = self.cleaned_data['isbn'] - if not isbn: - return '' - 
try: - self.work = Identifier.objects.get(type='isbn', value=isbn).work - return isbn - except Identifier.DoesNotExist: - self.work = None - raise forms.ValidationError( 'That ISBN is not in our database') class EbookFileForm(forms.ModelForm): file = forms.FileField(max_length=16777216) @@ -110,7 +84,7 @@ def __init__(self, campaign_type=BUY2UNGLUE, *args, **kwargs): self.fields['format'].widget = forms.HiddenInput() if campaign_type == THANKS: self.fields['format'].widget = forms.Select( - choices = (('pdf', 'PDF'), ('epub', 'EPUB'), ('mobi', 'MOBI')) + choices = (('pdf', 'PDF'), ('epub', 'EPUB')) ) def clean_version_label(self): @@ -182,13 +156,11 @@ def clean(self): return self.cleaned_data class ProfileForm(forms.ModelForm): - clear_facebook = forms.BooleanField(required=False) clear_twitter = forms.BooleanField(required=False) - clear_goodreads = forms.BooleanField(required=False) class Meta: model = UserProfile - fields = 'tagline', 'librarything_id', 'home_url', 'clear_facebook', 'clear_twitter', 'clear_goodreads', 'avatar_source' + fields = 'tagline', 'librarything_id', 'home_url', 'avatar_source' widgets = { 'tagline': forms.Textarea(attrs={'rows': 5, 'onKeyUp': "counter(this, 140)", 'onBlur': "counter(this, 140)"}), } @@ -198,22 +170,9 @@ def __init__(self, *args, **kwargs): super(ProfileForm, self).__init__(*args, **kwargs) choices = [] for choice in self.fields['avatar_source'].choices : - if choice[0] == FACEBOOK and not profile.facebook_id: - pass - elif choice[0] == TWITTER and not profile.twitter_id: - pass - else: - choices.append(choice) + choices.append(choice) self.fields['avatar_source'].choices = choices - def clean(self): - # check that if a social net is cleared, we're not using it a avatar source - if self.cleaned_data.get("clear_facebook", False) and self.cleaned_data.get("avatar_source", None) == FACEBOOK: - self.cleaned_data["avatar_source"] == UNGLUEITAR - if self.cleaned_data.get("clear_twitter", False) and 
self.cleaned_data.get("avatar_source", None) == TWITTER: - self.cleaned_data["avatar_source"] == UNGLUEITAR - return self.cleaned_data - def getTransferCreditForm(maximum, data=None, *args, **kwargs ): class TransferCreditForm(forms.Form): recipient = AutoCompleteSelectField( @@ -292,9 +251,7 @@ class Meta: class CampaignPurchaseForm(forms.Form): - anonymous = forms.BooleanField(required=False, - label_suffix='', - label=_("Make this purchase anonymous")) + anonymous = forms.BooleanField(required=False, label=_("Make this purchase anonymous, please")) offer_id = forms.IntegerField(required=False) offer = None library_id = forms.IntegerField(required=False) @@ -359,12 +316,11 @@ def trans_extra(self): class CampaignThanksForm(forms.Form): anonymous = forms.BooleanField( required=False, - label_suffix='', - label=_("Make this contribution anonymous") + label=_("Make this contribution anonymous, please") ) preapproval_amount = forms.DecimalField( required = True, - min_value=D('1.00'), + min_value=D('2.00'), max_value=D('2000.00'), decimal_places=2, label="Pledge Amount", @@ -374,19 +330,21 @@ def trans_extra(self): pe = PledgeExtra( anonymous=self.cleaned_data['anonymous'] ) class DonationForm(forms.Form): + # used only for validation; not currently used for display amount = forms.DecimalField( required = True, - min_value=D('1.00'), + min_value=D('5.00'), max_value=D('20000.00'), decimal_places=2, label="Donation Amount", ) + reason = forms.ChoiceField(choices=DONATION_CHOICES, required=False) class CampaignPledgeForm(forms.Form): preapproval_amount = forms.DecimalField( required = False, - min_value=D('1.00'), + min_value=D('2.00'), max_value=D('5000.00'), decimal_places=2, label="Support Amount", @@ -394,10 +352,7 @@ class CampaignPledgeForm(forms.Form): def amount(self): return self.cleaned_data["preapproval_amount"] if self.cleaned_data else None - anonymous = forms.BooleanField( - required=False, - label_suffix='', - label=_("Make this support anonymous")) + 
anonymous = forms.BooleanField(required=False, label=_("Make this support anonymous, please")) ack_name = forms.CharField( required=False, max_length=64, @@ -454,6 +409,7 @@ class TokenCCMixin(forms.Form): class BaseCCMixin(forms.Form): work_id = forms.IntegerField(required=False, widget=forms.HiddenInput()) + reason = forms.CharField(required=False, widget=forms.HiddenInput()) preapproval_amount = forms.DecimalField( required=False, min_value=D('1.00'), @@ -479,11 +435,6 @@ class CCForm(UserCCMixin, BaseCCForm): class AccountCCForm( BaseCCMixin, UserCCMixin, forms.Form): pass -class GoodreadsShelfLoadingForm(forms.Form): - goodreads_shelf_name_number = forms.CharField(widget=forms.Select(choices=( - ('all','all'), - ))) - class LibraryThingForm(forms.Form): lt_username = forms.CharField(max_length=30, required=True) @@ -548,14 +499,14 @@ class MsgForm(forms.Form): def full_clean(self): super(MsgForm, self).full_clean() - if self.data.has_key("supporter"): + if "supporter" in self.data: try: self.cleaned_data['supporter'] = User.objects.get(id=self.data["supporter"]) except User.DoesNotExist: raise ValidationError("Supporter does not exist") else: raise ValidationError("Supporter is not specified") - if self.data.has_key("work"): + if "work" in self.data: try: self.cleaned_data['work'] = Work.objects.get(id=self.data["work"]) except Work.DoesNotExist: diff --git a/frontend/forms/bibforms.py b/frontend/forms/bibforms.py index 54881461b..873a676a3 100644 --- a/frontend/forms/bibforms.py +++ b/frontend/forms/bibforms.py @@ -62,7 +62,10 @@ def clean(self): id_value = self.cleaned_data.get('id_value', '').strip() make_new = self.cleaned_data.get('make_new', False) if not make_new: - self.cleaned_data['id_value'] = identifier_cleaner(id_type)(id_value) + if id_value: + self.cleaned_data['id_value'] = identifier_cleaner(id_type)(id_value) + if not self.cleaned_data['id_value']: + self.add_error('id_value', forms.ValidationError('The identifier was not valid')) return 
self.cleaned_data class Meta: @@ -165,7 +168,7 @@ def clean(self): else: err_msg = "{} is a duplicate for work #{}.".format(id_value, ident.work_id) self.add_error('id_value', forms.ValidationError(err_msg)) - except forms.ValidationError, ve: + except forms.ValidationError as ve: self.add_error( 'id_value', forms.ValidationError('{}: {}'.format(ve.message, id_value)) diff --git a/frontend/forms/rh_forms.py b/frontend/forms/rh_forms.py index 09a9212dd..1a172107c 100644 --- a/frontend/forms/rh_forms.py +++ b/frontend/forms/rh_forms.py @@ -1,7 +1,7 @@ from datetime import date, timedelta from decimal import Decimal as D -from ckeditor.widgets import CKEditorWidget +from ckeditor_uploader.widgets import CKEditorUploadingWidget from selectable.forms import ( AutoCompleteSelectMultipleWidget, @@ -10,8 +10,7 @@ from django import forms from django.conf import settings -from django.forms.extras.widgets import SelectDateWidget -from django.forms.widgets import RadioSelect +from django.forms.widgets import RadioSelect, SelectDateWidget from django.utils.translation import ugettext_lazy as _ from django.utils.timezone import now @@ -206,7 +205,7 @@ def __init__(self, instance=None , **kwargs): required=False, ) if self.initial and not self.initial.get('edition', None) and not instance.edition: - self.initial['edition'] = instance.work.editions.all()[0] + self.initial['edition'] = instance.work.editions.first() paypal_receiver = forms.EmailField( label=_("contact email address for this campaign"), @@ -215,7 +214,7 @@ def __init__(self, instance=None , **kwargs): 'required': 'You must enter the email we should contact you at for this campaign.' 
}, ) - work_description = forms.CharField(required=False , widget=CKEditorWidget()) + work_description = forms.CharField(required=False , widget=CKEditorUploadingWidget()) class Meta: model = Campaign diff --git a/frontend/tasks.py b/frontend/tasks.py new file mode 100644 index 000000000..75c24cae6 --- /dev/null +++ b/frontend/tasks.py @@ -0,0 +1,17 @@ +from os.path import join +from datetime import date + +from celery.task import task + +from django.conf import settings +from django.template.loader import get_template, render_to_string + +from .views import InfoPageView + +@task +def save_info_page(): + page_view = InfoPageView() + page = render_to_string(page_view.template_name, context=page_view.get_context_data()) + today = date.today().isoformat() + with open(join(settings.CELERY_LOG_DIR, 'metrics-%s.html' % today), 'w') as todays_metrics: + todays_metrics.write(page) diff --git a/frontend/templates/503.html b/frontend/templates/503.html index 4403b25d5..e2b47b1ec 100644 --- a/frontend/templates/503.html +++ b/frontend/templates/503.html @@ -11,7 +11,7 @@ <h1>Unglue.it is currently undergoing maintenance</h1> <p> -While you wait, why not like us on <a href="https://facebook.com/unglueit">Facebook</a>, follow us on <a href="https://twitter.com/unglueit">Twitter</a>, or subscribe to our <a href="https://blog.unglue.it">blog</a>? We'll keep you up to date there with our progress fixing things. +While you wait, why not like us on <a href="https://facebook.com/unglueit">Facebook</a>, follow us on <a href="https://digipres.club/@unglueit">Mastodon</a>, or subscribe to our <a href="https://blog.unglue.it">blog</a>? We'll keep you up to date there with our progress fixing things. 
</p> <p>You can also help us by <a href="{% url 'feedback' %}">sending us feedback</a>.</p> diff --git a/frontend/templates/_template_map.txt b/frontend/templates/_template_map.txt index fe1abdb94..bb9f5dffa 100644 --- a/frontend/templates/_template_map.txt +++ b/frontend/templates/_template_map.txt @@ -11,13 +11,11 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) about_unglued_empty.html about_wishlist.html about_wishlist_empty.html - base-questionnaire.html campaign_list.html extra_css extra_head cc_list.html extra_css extra_head comments.html extra_css extra_head download.html extra_js faceted_list.html extra_css extra_head - goodreads_display.html extra_head home.html extra_css extra_js kindle_change_successful.html extra_js libraryauth/library.html extra_head extra_css extra_js @@ -59,7 +57,6 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) manage_account.html extra_extra_head manage_campaign.html extra_extra_head manage_ebooks.html - manage_survey.html marc.html merge.html extra_extra_head metrics.html @@ -78,7 +75,6 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) rh_works.html rh_yours.html rights_holders.html extra_extra_head - surveys.html terms.html extra_css thanks.html basepledge.html extra_css extra_js extra_extra_head(empty) diff --git a/frontend/templates/about_funds.html b/frontend/templates/about_funds.html new file mode 100644 index 000000000..231cb6970 --- /dev/null +++ b/frontend/templates/about_funds.html @@ -0,0 +1,66 @@ +{% extends 'basedocumentation.html' %} +{% load sass_tags %} +{% block title %} Free Ebook Foundation Donations {% endblock %} +{% block extra_css %} +<link type="text/css" rel="stylesheet" href="{% sass_src 'scss/pledge.scss' %}" /> +{% endblock %} + +{% block topsection %} +{% endblock %} + +{% block doccontent %} + +<h2> Donating to Unglue.it </h2> +<p> +Unglue.it is a program of the <a href="https://ebookfoundation.org">Free Ebook Foundation</a> (FEF), which is a charitable, 
not-for-profit corporation. +Donations to the Free Ebook Foundation are tax-deductible in the United States. +</p> +<p> +When you donate to the Free Ebook Foundation, you can specify how you would like your donation to be used. There are currently two options: +</p> +<ul class="bullets"> +<li> <a href="#monographs">The FEF Open Access Monographs Fund</a>: to support the Ungluing of peer-reviewed monographs that advance scholarship, science and learning </li> +<li> <a href="#general">The FEF General Fund</a>: to support the operation and maintenance of the Foundation's programs, including Unglue.it.</li> +</ul> +<h3 id="monographs">The FEF Open Access Monograph Fund</h3> +<p> +Scholars write books to spread their ideas, so it makes sense to make them free and available. +We refer to these books as "monographs" because they usually embody the scholarship of a single author. +Already, over 30,000 of these books are available to download from the Unglue.it database. +Sadly, many more books are locked up behind paywalls - not because their authors want to make money, but because the publishers of these books need to recoup the cost of editorial work and design. +Many new books will remain unpublished because publishers committed to Open Access have insufficient resources to publish all the books deserving of wider audiences. +</p> +<p> +As a small step towards addressing these needs, we're offering donors a chance to help us unglue more of these monographs by donating to a special fund. +The fund will be used to match contributions to qualified ungluing campaigns on Unglue.it. +To participate, authors should first work with a publisher to establish a campaign target, and then create an ungluing campaign. +To get started, follow the steps at our <a href="{% url 'rightsholders' %}">rights holder tools page</a>. +Our staff will verify that the book has been or will be peer-reviewed and advances scholarship, science and learning. 
+Resources from the fund will be allocated to maximize the success of the eligible campaigns. +If you want to donate to a specific campaign, just donate to the campaign directly. +</p> +<h3 id="general">The FEF General Fund</h3> +<p> +If you prefer to support all the work of the Free Ebook Foundation, including Unglue.it, Free-Programming-Books, and our work supporting Project Gutenberg, just use the General Fund. +</p> + +<h2 id="donationform">Donate Now!</h2> + <div id="authorize" class="jsmod-content" > + <form class="askform" method="POST" action="{% url 'newdonation' %}"> + <p class=" form-row clearfix"> + <input id="id_reason_monographs" checked type="radio" value="monographs" name="reason"><label for="id_reason_monographs">FEF Open Access Monographs Fund</label> + </p> <p class=" form-row clearfix"> + <input id="id_reason_general" type="radio" value="general" name="reason"><label for="id_reason_general">FEF General Fund</label> + </p> + <div class="donate_amount clearfix"> + <label>Amount ($): </label><input id="amount" max="20000.00" min="5.00" name="amount" step="0.01" type="number" value="10.00" class="donate"></div> + <div class="button"> + <input name="pledge" type="submit" value="Donate" id="donatesubmit" class="donate" /> + </div> + + </form> + + </div> + + +{% endblock %} diff --git a/frontend/templates/about_lightbox_footer.html b/frontend/templates/about_lightbox_footer.html index c15891e66..753c3b077 100644 --- a/frontend/templates/about_lightbox_footer.html +++ b/frontend/templates/about_lightbox_footer.html @@ -5,5 +5,5 @@ {% if not user.is_authenticated %} <div class="signuptoday"><a class="btn btn-signup" href="{% url 'registration_register' %}?next={% if request.GET.next %}{{ request.GET.next|urlencode }}{% else %}{{ request.get_full_path|urlencode}}{% endif %}">Sign up today <i class="fa fa-chevron-right"></i></a></div> {% else %} - <div class="signuptoday"><a href="{% url 'campaign_list' 'ending' %}">Our campaigns</a></div> + <div 
class="signuptoday"><a href="{% url 'campaign_list' 't4u' %}">Our campaigns</a></div> {% endif %} diff --git a/frontend/templates/add_your_books.html b/frontend/templates/add_your_books.html index b285e3d5d..3ded0d4bf 100644 --- a/frontend/templates/add_your_books.html +++ b/frontend/templates/add_your_books.html @@ -1,10 +1,14 @@ +{% load cf %} + <script type="text/javascript" src="/static/js/watermark_init.js"></script> + <script type="text/javascript" src="/static/js/watermark_change.js"></script> <p id="add_your_books"><b>Claiming a work</b></p> <p>If your book is indexed in Google books, we can add it to our database automagically. Click on the result list to add your book to our database.</p> <form action="{% url 'search' %}" method="get"> - <div class="inputalign"> - <input type="text" id="watermark" size="25" onfocus="imgfocus()" onblur="imgblur(15)" class="inputbox" name="q" value="{{ q }}"><input type="hidden" name="page" value="2"> - <input type="submit" class="button"> - </div> + <div class="cf-turnstile" data-sitekey="{% cf_site %}" data-appearance="interaction-only" data-callback="enableSubmit"></div> + <div class="inputalign"> + <input type="text" id="nowatermark" size="25" class="inputbox" name="q" value="{{ q }}" required> + <input type="submit" class="button" id="search-button" disabled> + </div> </form> <ul class="bullets"> <li>Use the Claim option on the More... 
tab of each book's page.</li> diff --git a/frontend/templates/agreed.html b/frontend/templates/agreed.html index f2d8535b0..4b16772b1 100644 --- a/frontend/templates/agreed.html +++ b/frontend/templates/agreed.html @@ -3,7 +3,7 @@ {% block title %} Agreement Submitted {% endblock %} {% block extra_extra_head %} {{ block.super }} -<link rel="stylesheet" href="/static/css/ui-lightness/jquery-ui-1.8.16.custom.css" type="text/css" media="screen"> +<link rel="stylesheet" href="{{ jquery_ui_theme }}" type="text/css" media="screen"> <script type="text/javascript" src="{{ jquery_ui_home }}"></script> {% endblock %} diff --git a/frontend/templates/base-questionnaire.html b/frontend/templates/base-questionnaire.html deleted file mode 100644 index 961b60a60..000000000 --- a/frontend/templates/base-questionnaire.html +++ /dev/null @@ -1,38 +0,0 @@ -{% extends "base.html" %} -{% load landings %} -{% block title %}{{ block.super }}Questionnaire{% endblock title %} -{% block search_box %} -{% render_with_landing '' %} -<a href="{{landing_object.publishers.0.url}}"><img style="float:left;margin:10px" src="{{landing_object.publishers.0.logo_url}}" alt="{{landing_object.publishers.0.name}}" /></a> -{% endblock %} -{% block signin %} -{% endblock %} -{% block extra_css %} - <link rel="stylesheet" href="/static/bootstrap/bootstrap.min.css" type="text/css" /> - <link rel="stylesheet" href="/static/questionnaire.css" /> - <style type="text/css"> - {% block styleextra %} - {% endblock %} - </style> -{% endblock %} -{% block extra_head %} - {% block headextra %} - {% endblock %} -{% endblock %} - -{% block language %} - {% for lang in LANGUAGES %} - {% if not forloop.first %} | {% endif %} - <a href="/setlang/?lang={{ lang.0 }}&next={{ request.path }}">{{ lang.1 }}</a> - {% endfor %} -{% endblock language %} - -{% block content %} -<div id="main-container"> - <div class="js-main"> - - {% block questionnaire %}{% endblock questionnaire %} - </div> -</div> - -{% endblock %} diff --git 
a/frontend/templates/base.html b/frontend/templates/base.html index 95dc6115e..4dbf01f0a 100644 --- a/frontend/templates/base.html +++ b/frontend/templates/base.html @@ -1,28 +1,26 @@ <!DOCTYPE html> -{% load truncatechars %}{% load sass_tags %} +{% load sass_tags %} -<html lang="en"> +<html> <head> <meta charset="utf-8" /> - <meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="referrer" content="origin" /> <title>unglue.it {% block title %}{% endblock %} - {% block extra_meta %}{% endblock %} + {% block extra_css %}{% endblock %} - - - - - - + - - - + + + {% block extra_js %} {% endblock %} {% if show_langs %} @@ -51,86 +49,87 @@
-
-