diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 2a8ff118e3f..6de79fca7ad 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -18,8 +18,15 @@ module RetrieveTitle if html =~ // && html !~ /<\/title>/ return nil end - if doc = Nokogiri::HTML5(html, nil, encoding) + doc = nil + begin + doc = Nokogiri::HTML5(html, nil, encoding) + rescue ArgumentError + # invalid HTML (too many attributes) - ignore + end + + if doc title = doc.at('title')&.inner_text # A horrible hack - YouTube uses `document.title` to populate the title diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb index 7edd62ff08a..b59ff8d1b20 100644 --- a/spec/lib/retrieve_title_spec.rb +++ b/spec/lib/retrieve_title_spec.rb @@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do ) expect(title).to eq("Video Title") end + + it "will not exception out for invalid html" do + attributes = (1..1000).map { |x| " attr#{x}='1' " }.join + title = RetrieveTitle.extract_title <<~HTML + <html> + <title>test + + + HTML + + expect(title).to eq(nil) + end end describe ".crawl" do