FIX: ignore malformed HTML for title extraction (#18040)

Certain HTML can be rejected by nokogumbo, specifically markup with an
enormous number of attributes.

This ensures that malformed HTML is simply skipped instead of leaking
an exception and terminating downstream processes.
This commit is contained in:
Sam
2022-08-23 15:03:57 +10:00
committed by GitHub
parent 5d44c31bfa
commit df04462475
2 changed files with 20 additions and 1 deletions

View File

@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do
)
expect(title).to eq("Video Title")
end
it "will not exception out for invalid html" do
  # Build a <body> tag carrying 1000 attributes (attr1 .. attr1000).
  attribute_list = 1.upto(1000).map { |n| " attr#{n}='1' " }.join

  oversized_html = <<~HTML
    <html>
    <title>test</title>
    <body #{attribute_list}>
    </html>
  HTML

  # The example passes only if extraction returns nil instead of raising.
  title = RetrieveTitle.extract_title(oversized_html)
  expect(title).to be_nil
end
end
describe ".crawl" do