diff --git a/conf/openlibrary.yml b/conf/openlibrary.yml index e75fd373088..68d6c5db1ee 100644 --- a/conf/openlibrary.yml +++ b/conf/openlibrary.yml @@ -190,3 +190,18 @@ sentry_cron_jobs: # Observations cache settings: observation_cache_duration: 86400 + +# Proxy configuration. +# http_proxy sets the global default (no auth) via HTTP_PROXY/HTTPS_PROXY env vars. +# http_proxies overrides per service with credentials; each entry has url/user/password. +# Dev/local: leave both unset — no proxy needed. +# http_proxy: http://squid.example.com:3128 +# http_proxies: +# recaptcha: +# url: http://squid.example.com:3128 +# user: '' +# password: '' +# amazon: +# url: http://squid.example.com:3128 +# user: '' +# password: '' diff --git a/openlibrary/core/vendors.py b/openlibrary/core/vendors.py index a038af74625..44d1a6a5aef 100644 --- a/openlibrary/core/vendors.py +++ b/openlibrary/core/vendors.py @@ -423,6 +423,93 @@ def __init__( # AmazonCreatorsApi; replace its rest_client to route all # outbound HTTP through the proxy. self.api._api_client.rest_client = rest_client + + # Also inject a proxy-aware OAuth2 token manager. The SDK's + # OAuth2TokenManager.refresh_token() calls bare requests.post() + # which reads HTTP_PROXY from the environment. That variable + # holds a bare (unauthenticated) squid URL, whereas reaching + # Amazon's token endpoint requires authenticated proxy access, so + # the bare URL would produce a 403. We override refresh_token() + # here to use a requests.Session with the authenticated proxy URL + # embedded directly, bypassing the env-var lookup entirely. 
if proxy_creds:
    # Imported here rather than at module level: the surrounding handler
    # catches ImportError and falls back to the environment-level proxy.
    from urllib.parse import quote as _urlquote
    from urllib.parse import urlparse as _urlparse
    from urllib.parse import urlunparse as _urlunparse

    from creatorsapi_python_sdk.auth.oauth2_config import (
        OAuth2Config as _OAuth2Config,
    )
    from creatorsapi_python_sdk.auth.oauth2_token_manager import (
        OAuth2TokenManager as _OAuth2TokenManager,
    )

    # proxy_creds is "user:password"; partition() splits on the first colon
    # only, so the password itself may contain colons.
    _user, _, _password = proxy_creds.partition(":")
    _parsed = _urlparse(proxy_url)
    _host = _parsed.hostname
    # urlparse() strips the brackets from an IPv6 literal; restore them so
    # the rebuilt netloc stays parseable once a port is appended.
    if _host and ":" in _host:
        _host = f"[{_host}]"
    # Percent-encode the credentials so "@", ":" or "/" inside them cannot
    # be mistaken for URL delimiters.
    _netloc = f"{_urlquote(_user, safe='')}:{_urlquote(_password, safe='')}@{_host}"
    if _parsed.port:
        _netloc += f":{_parsed.port}"
    _auth_proxy_url = _urlunparse(_parsed._replace(netloc=_netloc))
    _proxies = {"http": _auth_proxy_url, "https": _auth_proxy_url}

    class _ProxyAwareTokenManager(_OAuth2TokenManager):
        """Routes OAuth2 token refresh through the authenticated proxy."""

        # Seconds to wait for the token endpoint; requests has no default
        # timeout, so without one a stalled proxy would block the caller.
        _REQUEST_TIMEOUT = 10

        def refresh_token(self):
            """Fetch a new access token through the authenticated proxy.

            Returns:
                The new access token, which is also stored on
                ``self.access_token`` together with ``self.expires_at``.

            Raises:
                Exception: if the endpoint answers with a non-200 status or
                    the response carries no ``access_token``. Any failure
                    clears the cached token before re-raising.
            """
            import requests as _req

            try:
                payload = {
                    "grant_type": self.config.get_grant_type(),
                    "client_id": self.config.get_credential_id(),
                    "client_secret": self.config.get_credential_secret(),
                    "scope": self.config.get_scope(),
                }
                # LWA endpoints take a JSON body; otherwise the payload is
                # sent form-encoded.
                if self.config.is_lwa():
                    body_kwargs = {
                        "json": payload,
                        "headers": {"Content-Type": "application/json"},
                    }
                else:
                    body_kwargs = {
                        "data": payload,
                        "headers": {"Content-Type": "application/x-www-form-urlencoded"},
                    }
                # The context manager closes the session's connection pool.
                with _req.Session() as session:
                    # Pass proxies per request: requests lets HTTP_PROXY /
                    # HTTPS_PROXY from the environment overwrite
                    # session.proxies, which would silently route this call
                    # through the unauthenticated proxy.
                    resp = session.post(
                        self.config.get_cognito_endpoint(),
                        proxies=_proxies,
                        timeout=self._REQUEST_TIMEOUT,
                        **body_kwargs,
                    )
                    if resp.status_code != 200:
                        raise Exception(f"OAuth2 token request failed with status {resp.status_code}: {resp.text}")
                    data = resp.json()
                if "access_token" not in data:
                    raise Exception("No access token received from OAuth2 endpoint")
                self.access_token = data["access_token"]
                # Expire 30 seconds early so a token is never used right at
                # the edge of its lifetime.
                self.expires_at = time.time() + data.get("expires_in", 3600) - 30
                return self.access_token
            except Exception:
                self.clear_token()
                raise

    api_client = self.api._api_client
    _oauth_config = _OAuth2Config(
        api_client.credential_id,
        api_client.credential_secret,
        api_client.version,
        api_client.auth_endpoint,
    )
    api_client._token_manager = _ProxyAwareTokenManager(_oauth_config)
assert utils.get_proxy_params("recaptcha") is None + + def test_unknown_service_tag(self): + with patch("openlibrary.plugins.upstream.utils.config") as mock_config: + mock_config.get.return_value = {"amazon": {"url": "http://proxy:3128"}} + assert utils.get_proxy_params("recaptcha") is None + + def test_url_only_no_auth(self): + with patch("openlibrary.plugins.upstream.utils.config") as mock_config: + mock_config.get.return_value = {"recaptcha": {"url": "http://proxy:3128"}} + result = utils.get_proxy_params("recaptcha") + assert result == {"http": "http://proxy:3128", "https": "http://proxy:3128"} + + def test_url_with_auth(self): + with patch("openlibrary.plugins.upstream.utils.config") as mock_config: + mock_config.get.return_value = { + "recaptcha": { + "url": "http://proxy:3128", + "user": "myuser", + "password": "mypass", + } + } + result = utils.get_proxy_params("recaptcha") + assert result == { + "http": "http://myuser:mypass@proxy:3128", + "https": "http://myuser:mypass@proxy:3128", + } + + def test_special_chars_in_credentials_are_encoded(self): + with patch("openlibrary.plugins.upstream.utils.config") as mock_config: + mock_config.get.return_value = { + "recaptcha": { + "url": "http://proxy:3128", + "user": "u@ser", + "password": "p@ss:word", + } + } + result = utils.get_proxy_params("recaptcha") + assert result == { + "http": "http://u%40ser:p%40ss%3Aword@proxy:3128", + "https": "http://u%40ser:p%40ss%3Aword@proxy:3128", + } diff --git a/openlibrary/plugins/upstream/utils.py b/openlibrary/plugins/upstream/utils.py index 82eb0a96f91..0531540c45b 100644 --- a/openlibrary/plugins/upstream/utils.py +++ b/openlibrary/plugins/upstream/utils.py @@ -1623,6 +1623,37 @@ def setup_requests(config=config) -> None: logger.info("Requests set up") +def get_proxy_params(service_tag: str) -> dict[str, str] | None: + """Return a requests-compatible proxies dict for a service requiring proxy auth. + + Reads from the ``http_proxies`` config section. 
Each entry may have: + url: proxy base URL + user: proxy username + password: proxy password + + Returns None when no service-specific config exists so that callers can + pass the result directly as ``proxies=`` to requests — None means requests + will fall back to the global HTTP_PROXY/HTTPS_PROXY env vars set by + setup_requests(). + """ + service = config.get("http_proxies", {}).get(service_tag) + if not service: + return None + + proxy_url = service.get("url", "") + user = service.get("user", "") + password = service.get("password", "") + + if user and proxy_url: + parsed = urlparse(proxy_url) + netloc = f"{quote(user, safe='')}:{quote(password, safe='')}@{parsed.hostname}" + if parsed.port: + netloc += f":{parsed.port}" + proxy_url = urlunparse(parsed._replace(netloc=netloc)) + + return {"http": proxy_url, "https": proxy_url} if proxy_url else None + + def setup() -> None: """Do required initialization""" # monkey-patch get_markdown to use OL Flavored Markdown diff --git a/scripts/affiliate_server.py b/scripts/affiliate_server.py index 00c648bcd03..00ac9a2f2e7 100644 --- a/scripts/affiliate_server.py +++ b/scripts/affiliate_server.py @@ -638,8 +638,15 @@ def GET(self, identifier: str) -> str: def load_config(configfile): # This loads openlibrary.yml + infobase.yml openlibrary_load_config(configfile) - http_proxy_url = config.get("http_proxy") - http_proxy_creds = config.get("http_proxy_creds") + + # Prefer per-service proxy config under http_proxies.amazon; fall back to the + # legacy flat keys http_proxy / http_proxy_creds for backward compatibility. + amazon_proxy_cfg = config.get("http_proxies", {}).get("amazon", {}) + http_proxy_url = amazon_proxy_cfg.get("url") or config.get("http_proxy") + if amazon_proxy_cfg.get("user"): + http_proxy_creds = f"{amazon_proxy_cfg['user']}:{amazon_proxy_cfg.get('password', '')}" + else: + http_proxy_creds = config.get("http_proxy_creds", "") stats.client = stats.create_stats_client(cfg=config)