Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 197 additions & 0 deletions readthedocs/projects/admin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
"""Django administration interface for `projects.models`."""

import re
from urllib.parse import urlparse

from django import forms
from django.conf import settings
from django.contrib import admin
from django.contrib import messages
Expand All @@ -10,6 +14,8 @@
from django.db.models import Sum
from django.db.models import Value
from django.forms import BaseInlineFormSet
from django.template.response import TemplateResponse
from django.urls import path
from django.utils.html import format_html
from django.utils.translation import gettext_lazy as _

Expand Down Expand Up @@ -211,6 +217,99 @@ def queryset(self, request, queryset):
return queryset


class SpamRuleChecksFromURLsForm(forms.Form):
    """
    Form to paste URLs and queue spam rule checks for the matching projects.

    The single ``urls`` field holds free-form text with one URL per line.
    """

    urls = forms.CharField(
        label="URLs",
        widget=forms.Textarea(attrs={"rows": 20, "cols": 100}),
        # The help text is rendered as raw HTML (the page template pipes it
        # through ``safe``), so the angle brackets around ``slug`` must be
        # written as entities; a literal ``<slug>`` would be swallowed by the
        # browser as an unknown tag and vanish from the displayed examples.
        help_text=(
            "One URL per line. Both documentation URLs "
            "(https://&lt;slug&gt;.readthedocs.io/...) and dashboard URLs "
            "(https://app.readthedocs.org/projects/&lt;slug&gt;/...) are accepted. "
            "Messy inputs from automated reports are tolerated: defanged URLs "
            "(hxxps://, [.] / (.)), surrounding brackets or quotes, missing "
            "scheme, and trailing punctuation are normalized automatically."
        ),
    )


# Surrounding characters often added by mail clients, markdown, defanging
# tools, or word-wrapping that should be stripped before parsing the URL.
_URL_STRIP_CHARS = " \t\r\n\"'<>()[]{}.,;!?`"


def _normalize_url(value):
"""
Best-effort normalization of a possibly-defanged or messy URL.

Handles forms commonly seen in abuse reports and emails: ``hxxps://``,
``[.]``/``(.)`` separators, surrounding angle/square brackets, markdown
``[text](url)`` links, trailing punctuation, missing scheme, etc.
"""
if not value:
return ""

value = value.strip()

# Markdown link: [label](http...)
md_link = re.match(r"^\[[^\]]*\]\((.+)\)$", value)
if md_link:
value = md_link.group(1)

value = value.strip(_URL_STRIP_CHARS)

# Undefang common patterns used in security reports.
value = re.sub(r"^hxxp(s?)\b", r"http\1", value, flags=re.IGNORECASE)
value = value.replace("[.]", ".").replace("(.)", ".").replace("{.}", ".")
value = value.replace("[:]", ":").replace("[/]", "/")

return value


def _extract_project_slug_from_url(url):
    """
    Extract a project slug from a Read the Docs URL.

    Supports docs subdomain URLs like ``https://<slug>.readthedocs.io/...`` and
    dashboard URLs like ``https://readthedocs.org/projects/<slug>/...``. Tries
    to be tolerant of messy inputs (defanged URLs, missing scheme, surrounding
    brackets/quotes) so admins can paste output from automated reporting tools
    directly.

    The dashboard form is recognised purely by its path (``/projects/<slug>``)
    and is checked first; the hostname is not inspected for that form. The
    subdomain form is matched against ``settings.PUBLIC_DOMAIN``.

    :param url: raw URL text, possibly messy; may be ``None``.
    :returns: the slug string, or ``None`` when no slug can be extracted.
    """
    if url is None:
        return None

    cleaned = _normalize_url(url)
    if not cleaned:
        return None

    # urlparse needs a leading scheme to populate ``hostname``. Look for one
    # at the START of the string only: a ``://`` that appears later (for
    # example inside a query string such as ``?next=https://...``) does not
    # mean the URL itself carries a scheme.
    if not re.match(r"[a-z][a-z0-9+.-]*://", cleaned, flags=re.IGNORECASE):
        cleaned = "https://" + cleaned.lstrip("/")

    try:
        parsed = urlparse(cleaned)
    except ValueError:
        return None

    hostname = (parsed.hostname or "").lower()

    # Dashboard URLs: <something>/projects/<slug>/...
    path_parts = [p for p in parsed.path.split("/") if p]
    if len(path_parts) >= 2 and path_parts[0] == "projects":
        return path_parts[1] or None

    # Docs subdomain URLs: <slug>.<PUBLIC_DOMAIN>
    public_domain = (settings.PUBLIC_DOMAIN or "").lower()
    if hostname and public_domain and hostname.endswith("." + public_domain):
        subdomain = hostname[: -(len(public_domain) + 1)]
        # Only the leftmost label is the project slug.
        return subdomain.split(".")[0] or None

    return None


@admin.register(Project)
class ProjectAdmin(ExtraSimpleHistoryAdmin):
"""Project model admin view."""
Expand Down Expand Up @@ -262,6 +361,104 @@ class ProjectAdmin(ExtraSimpleHistoryAdmin):
"import_tags_from_vcs",
]

def get_urls(self):
    """Return the admin URL patterns, with the spam-checks-from-URLs page listed first."""
    inherited_patterns = super().get_urls()
    spam_checks_route = path(
        "spam-rule-checks-from-urls/",
        self.admin_site.admin_view(self.spam_rule_checks_from_urls_view),
        name="projects_project_spam_rule_checks_from_urls",
    )
    # The custom route goes ahead of the inherited patterns.
    return [spam_checks_route] + inherited_patterns

def spam_rule_checks_from_urls_view(self, request):
    """
    Queue spam rule checks for the projects named by a pasted list of URLs.

    On a valid POST, every non-blank line is turned into a project slug,
    the slugs are looked up, and the ``spam_rules_check`` task is queued for
    the ones that exist. Slugs without a project and lines that yield no
    slug are reported through the messages framework and in the rendered
    results. Any other request simply renders the form.
    """
    form = SpamRuleChecksFromURLsForm(request.POST or None)
    results = None

    if request.method == "POST" and form.is_valid():
        stripped_lines = (
            line.strip() for line in form.cleaned_data["urls"].splitlines()
        )
        submitted = [entry for entry in stripped_lines if entry]

        # Maps each slug to the first URL that produced it; the dict keeps
        # slugs unique.
        first_url_by_slug = {}
        rejected = []
        for candidate in submitted:
            slug = _extract_project_slug_from_url(candidate)
            if not slug:
                rejected.append(candidate)
                continue
            first_url_by_slug.setdefault(slug, candidate)

        existing_slugs = list(
            Project.objects.filter(
                slug__in=first_url_by_slug.keys()
            ).values_list("slug", flat=True)
        )
        absent_slugs = sorted(set(first_url_by_slug) - set(existing_slugs))

        if existing_slugs:
            if "readthedocsext.spamfighting" not in settings.INSTALLED_APPS:
                messages.add_message(
                    request,
                    messages.ERROR,
                    "Spam fighting Django application not installed",
                )
            else:
                # Imported lazily: the extension is optional.
                from readthedocsext.spamfighting.tasks import spam_rules_check  # noqa

                spam_rules_check.delay(project_slugs=list(existing_slugs))
                triggered_text = "Spam check task triggered for {} project(s).".format(
                    len(existing_slugs)
                )
                messages.add_message(request, messages.INFO, triggered_text)

        if absent_slugs:
            absent_text = "No project found for slug(s): {}".format(
                ", ".join(absent_slugs)
            )
            messages.add_message(request, messages.WARNING, absent_text)

        if rejected:
            rejected_text = "Could not extract a project slug from URL(s): {}".format(
                ", ".join(rejected)
            )
            messages.add_message(request, messages.WARNING, rejected_text)

        if not submitted:
            messages.add_message(request, messages.ERROR, "No URLs provided")

        results = {
            "matched_slugs": sorted(existing_slugs),
            "missing_slugs": absent_slugs,
            "unparseable_urls": rejected,
        }

    context = dict(
        self.admin_site.each_context(request),
        title="Run spam rule checks from URLs",
        opts=self.model._meta,
        form=form,
        results=results,
    )
    return TemplateResponse(
        request,
        "admin/projects/project/spam_rule_checks_from_urls.html",
        context,
    )

def get_queryset(self, request):
qs = super().get_queryset(request)

Expand Down
74 changes: 74 additions & 0 deletions readthedocs/rtd_tests/tests/projects/test_admin_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from django.test import TestCase

from readthedocs.core.models import UserProfile
from readthedocs.projects.admin import _extract_project_slug_from_url
from readthedocs.projects.models import Project


Expand Down Expand Up @@ -57,6 +58,79 @@
self.assertFalse(self.project.users.filter(profile__banned=True).exists())
self.assertEqual(self.project.users.filter(profile__banned=False).count(), 2)

def test_extract_project_slug_from_dashboard_url(self):
    """A dashboard ``/projects/<slug>/...`` URL resolves to that slug."""
    dashboard_url = "https://readthedocs.org/projects/pip/builds/12345/"
    extracted = _extract_project_slug_from_url(dashboard_url)
    assert extracted == "pip"

def test_extract_project_slug_from_subdomain_url(self):
    """A docs subdomain URL resolves to the leftmost host label."""
    docs_url = "https://pip.readthedocs.io/en/latest/"
    extracted = _extract_project_slug_from_url(docs_url)
    assert extracted == "pip"

def test_extract_project_slug_from_unknown_url_returns_none(self):
    """A URL on an unrelated host with no ``/projects/`` path yields ``None``."""
    unrelated_url = "https://example.com/foo/bar"
    extracted = _extract_project_slug_from_url(unrelated_url)
    assert extracted is None

def test_extract_project_slug_from_messy_urls(self):
    """Defanged, wrapped, or scheme-less inputs all resolve to ``pip``."""
    expected = "pip"
    messy_inputs = [
        # Defanged with hxxps and [.]
        "hxxps://pip[.]readthedocs[.]io/en/latest/",
        # Defanged with (.)
        "hxxp://pip(.)readthedocs(.)io/",
        # Wrapped in angle brackets (mail clients)
        "<https://pip.readthedocs.io/>",
        # Trailing punctuation
        "https://pip.readthedocs.io/.",
        "https://pip.readthedocs.io/,",
        # Surrounding quotes
        '"https://pip.readthedocs.io/"',
        # Markdown link form
        "[pip docs](https://pip.readthedocs.io/en/latest/)",
        # No scheme, just hostname
        "pip.readthedocs.io",
        # No scheme, dashboard path
        "readthedocs.org/projects/pip/",
        # Surrounding whitespace
        " https://pip.readthedocs.io/ ",
    ]
    for raw in messy_inputs:
        extracted = _extract_project_slug_from_url(raw)
        assert extracted == expected, f"failed for {raw!r}"

def test_extract_project_slug_handles_none_and_empty(self):
    """``None``, the empty string, and whitespace-only input all yield ``None``."""
    for blank in (None, "", " "):
        extracted = _extract_project_slug_from_url(blank)
        assert extracted is None

def test_spam_rule_checks_from_urls_view_get(self):
    """A GET request to the spam-checks-from-URLs page responds with 200."""
    page_url = urls.reverse("admin:projects_project_spam_rule_checks_from_urls")
    resp = self.client.get(page_url)
    assert resp.status_code == 200

def test_spam_rule_checks_from_urls_view_post(self):
    """POSTed URLs are sorted into matched, missing, and unparseable results."""
    unrelated_url = "https://example.com/not/a/project"
    urls_text = "\n".join(
        [
            f"https://{self.project.slug}.readthedocs.io/en/latest/",
            "https://no-such-project-slug.readthedocs.io/",
            unrelated_url,
        ]
    )
    resp = self.client.post(
        urls.reverse("admin:projects_project_spam_rule_checks_from_urls"),
        {"urls": urls_text},
    )
    assert resp.status_code == 200

    # Check the structured results instead of searching the rendered page
    # for substrings. A substring check cannot tell the categories apart:
    # the project slug would also be present in the page if it had been
    # reported as missing, so the old assertions could pass vacuously.
    results = resp.context["results"]
    assert results["matched_slugs"] == [self.project.slug]
    assert results["missing_slugs"] == ["no-such-project-slug"]
    assert results["unparseable_urls"] == [unrelated_url]

Check failure

Code scanning / CodeQL

Incomplete URL substring sanitization High test

The string "example.com" may be at an arbitrary position in the sanitized URL.

@mock.patch("readthedocs.projects.admin.clean_project_resources")
def test_project_delete(self, clean_project_resources):
"""Test project and artifacts are removed."""
Expand Down
11 changes: 11 additions & 0 deletions readthedocs/templates/admin/projects/project/change_list.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{% extends "admin/change_list.html" %}
{% load i18n %}

{# Adds a "Run spam rule checks from URLs" link to the change list's object tools, placed before the inherited items. #}
{% block object-tools-items %}
<li>
<a href="{% url 'admin:projects_project_spam_rule_checks_from_urls' %}">
{% translate 'Run spam rule checks from URLs' %}
</a>
</li>
{# Keep whatever the parent template renders in this block. #}
{{ block.super }}
{% endblock %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls %}

{# Page for pasting project URLs and queueing spam rule checks. Reads `title`, `opts`, `form`, and `results` from the context. #}

{# Breadcrumb trail: Home > app > project change list > this page. #}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% translate 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:app_list' app_label=opts.app_label %}">{{ opts.app_config.verbose_name }}</a>
&rsaquo; <a href="{% url 'admin:projects_project_changelist' %}">{{ opts.verbose_name_plural|capfirst }}</a>
&rsaquo; {{ title }}
</div>
{% endblock %}

{% block content %}
<h1>{{ title }}</h1>

<p>
Paste URLs of projects (one per line) to queue spam rule checks for them.
This is the same operation as the
<em>Run spam rule checks</em> admin action, but it accepts URLs and resolves
them to project slugs for you.
</p>

{# Summary of the last submission; each list item is shown only when its list is non-empty. #}
{% if results %}
<h2>Results</h2>
<ul>
{% if results.matched_slugs %}
<li>Matched project slugs ({{ results.matched_slugs|length }}):
<code>{{ results.matched_slugs|join:", " }}</code></li>
{% endif %}
{% if results.missing_slugs %}
<li>Slugs with no matching project ({{ results.missing_slugs|length }}):
<code>{{ results.missing_slugs|join:", " }}</code></li>
{% endif %}
{% if results.unparseable_urls %}
<li>URLs that could not be parsed ({{ results.unparseable_urls|length }}):
<code>{{ results.unparseable_urls|join:", " }}</code></li>
{% endif %}
</ul>
{% endif %}

{# The form posts back to this same URL (no action attribute). #}
<form method="post">{% csrf_token %}
<fieldset class="module aligned">
{% for field in form %}
<div class="form-row">
{{ field.label_tag }}
{{ field }}
{# help_text is marked safe, so it is output without HTML escaping; any markup in it reaches the browser as-is. #}
{% if field.help_text %}<div class="help">{{ field.help_text|safe }}</div>{% endif %}
{{ field.errors }}
</div>
{% endfor %}
</fieldset>
<div class="submit-row">
<input type="submit" value="{% translate 'Run spam rule checks' %}" class="default">
</div>
</form>
{% endblock %}
Loading