Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ function Readability(doc, options) {
this._disableJSONLD = !!options.disableJSONLD;
this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
this._linkDensityModifier = options.linkDensityModifier || 0;
/**
* If true, keep the first in-article H1/H2 that duplicates the article title
* and leave H1 tags in the extracted content. Defaults to false (strip the
* duplicate title header and normalize remaining H1 elements to H2).
*/
this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders;

// Start with all flags set
this._flags =
Expand Down Expand Up @@ -835,11 +841,13 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");

// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(
this._getAllNodesWithTag(articleContent, ["h1"]),
"h2"
);
if (!this._keepOriginalTitleHeaders) {
// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(
this._getAllNodesWithTag(articleContent, ["h1"]),
"h2"
);
}

// Remove extra paragraphs
this._removeNodes(
Expand Down Expand Up @@ -1064,7 +1072,7 @@ Readability.prototype = {
var elementsToScore = [];
var node = this._doc.documentElement;

let shouldRemoveTitleHeader = true;
let shouldRemoveTitleHeader = !this._keepOriginalTitleHeaders;

while (node) {
if (node.tagName === "HTML") {
Expand Down
6 changes: 6 additions & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ export interface ReadabilityOptions<T = string> {
* Defaults to 1.
*/
linkDensityModifier?: number;
/**
* If `true`, the first in-article heading that closely matches the article
* title is kept, and H1 tags in the extracted content are not rewritten to H2.
* Defaults to `false`.
*/
keepOriginalTitleHeaders?: boolean;
}

export class Readability<T = string> {
Expand Down
83 changes: 83 additions & 0 deletions test/test-keep-original-title-headers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/* eslint-env node, mocha */

var JSDOM = require("jsdom").JSDOM;
var chai = require("chai");
var expect = chai.expect;

var Readability = require("../index").Readability;

/**
 * Build a minimal HTML document used as a Readability fixture.
 *
 * The document has a <title>, and an <article> containing a single heading
 * followed by two identical filler paragraphs.
 *
 * @param {string} titleText   Text placed inside the document <title>.
 * @param {string} headingTag  Tag name used for the in-article heading (e.g. "h1").
 * @param {string} headingText Text placed inside the in-article heading.
 * @return {string} The serialized HTML document.
 */
function articleHtml(titleText, headingTag, headingText) {
  // The same filler paragraph is emitted twice after the heading.
  var paragraph =
    "<p>" +
    [
      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do ",
      "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad ",
      "minim veniam, quis nostrud exercitation ullamco laboris nisi ut ",
      "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit ",
      "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.",
    ].join("") +
    "</p>";

  // Arguments are concatenated in the order title, tag, text, tag.
  var head =
    "<!DOCTYPE html><html><head><title>" +
    titleText +
    "</title></head><body><article>";
  var heading =
    "<" + headingTag + ">" + headingText + "</" + headingTag + ">";

  return head + heading + paragraph + paragraph + "</article></body></html>";
}

describe("keepOriginalTitleHeaders option", function () {
  this.timeout(30000);

  /**
   * Run Readability over a synthetic article whose only heading is an <h1>.
   *
   * @param {string} titleText   Document <title> text.
   * @param {string} headingText Text of the in-article <h1>.
   * @param {Object} [options]   Readability options; when omitted, Readability
   *                             is constructed with the document only.
   * @return {Object} The value returned by Readability#parse().
   */
  function parseArticle(titleText, headingText, options) {
    var dom = new JSDOM(articleHtml(titleText, "h1", headingText), {
      url: "http://example.com/article",
    });
    var reader = options
      ? new Readability(dom.window.document, options)
      : new Readability(dom.window.document);
    return reader.parse();
  }

  it("when false, removes the first heading that duplicates the title and rewrites other H1 to H2", function () {
    var titleText = "Readability Title Headers Option Test 7f3a";
    var parsed = parseArticle(titleText, titleText);

    // The duplicate heading must be gone, not merely demoted to H2.
    expect(parsed.content).to.not.include("<h1>");
    expect(parsed.content).to.not.include("<h2>" + titleText);
    expect(parsed.title).to.eql(titleText);
  });

  it("when true, keeps the duplicate title header as H1 and does not rewrite it to H2", function () {
    var titleText = "Readability Title Headers Option Test 7f3b";
    var parsed = parseArticle(titleText, titleText, {
      keepOriginalTitleHeaders: true,
    });

    expect(parsed.content).to.include("<h1>" + titleText + "</h1>");
    expect(parsed.title).to.eql(titleText);
  });

  it("when false, rewrites a non-title H1 in the article body to H2", function () {
    var bodyHeading = "Distinct In Article Heading 9z2q";
    var parsed = parseArticle(
      "Readability Title Headers Option Test 7f3c",
      bodyHeading
    );

    expect(parsed.content).to.include("<h2>" + bodyHeading + "</h2>");
    expect(parsed.content).to.not.include("<h1>" + bodyHeading);
  });

  it("when true, leaves a non-title H1 in the article body as H1", function () {
    var bodyHeading = "Distinct In Article Heading 9z2r";
    var parsed = parseArticle(
      "Readability Title Headers Option Test 7f3d",
      bodyHeading,
      { keepOriginalTitleHeaders: true }
    );

    expect(parsed.content).to.include("<h1>" + bodyHeading + "</h1>");
    expect(parsed.content).to.not.include("<h2>" + bodyHeading);
  });
});
8 changes: 8 additions & 0 deletions test/test-readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,14 @@ describe("Readability API", function () {
);
});

it("should accept a keepOriginalTitleHeaders option", function () {
  // With no options supplied the flag is false.
  var defaultReader = new Readability(doc);
  expect(defaultReader._keepOriginalTitleHeaders).eql(false);

  // Passing the option as true is reflected on the instance.
  var optedInReader = new Readability(doc, { keepOriginalTitleHeaders: true });
  expect(optedInReader._keepOriginalTitleHeaders).eql(true);
});

it("should accept a allowedVideoRegex option or default it", function () {
expect(new Readability(doc)._allowedVideoRegex).eql(
Readability.prototype.REGEXPS.videos
Expand Down