diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
index 2a8ff118e3f..6de79fca7ad 100644
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -18,8 +18,15 @@ module RetrieveTitle
if html =~ /<title>/ && html !~ /<\/title>/
return nil
end
- if doc = Nokogiri::HTML5(html, nil, encoding)
+ doc = nil
+ begin
+ doc = Nokogiri::HTML5(html, nil, encoding)
+ rescue ArgumentError
+ # invalid HTML (too many attributes) - ignore
+ end
+
+ if doc
title = doc.at('title')&.inner_text
# A horrible hack - YouTube uses `document.title` to populate the title
diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb
index 7edd62ff08a..b59ff8d1b20 100644
--- a/spec/lib/retrieve_title_spec.rb
+++ b/spec/lib/retrieve_title_spec.rb
@@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do
)
expect(title).to eq("Video Title")
end
+
+ it "will not exception out for invalid html" do
+ attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
+ title = RetrieveTitle.extract_title <<~HTML
+
+ test
+
+
+ HTML
+
+ expect(title).to eq(nil)
+ end
end
describe ".crawl" do