mirror of
https://github.com/discourse/discourse.git
synced 2025-05-31 01:47:46 +08:00
FIX: ignore malformed HTML for title extraction (#18040)
Certain HTML can be rejected by nokogumbo, specifically cases where there is an enormous number of attributes. This ensures that malformed HTML is simply skipped instead of leaking out an exception and terminating downstream processes.
This commit is contained in:
@ -18,8 +18,15 @@ module RetrieveTitle
|
||||
if html =~ /<title>/ && html !~ /<\/title>/
|
||||
return nil
|
||||
end
|
||||
if doc = Nokogiri::HTML5(html, nil, encoding)
|
||||
|
||||
doc = nil
|
||||
begin
|
||||
doc = Nokogiri::HTML5(html, nil, encoding)
|
||||
rescue ArgumentError
|
||||
# invalid HTML (too many attributes) - ignore
|
||||
end
|
||||
|
||||
if doc
|
||||
title = doc.at('title')&.inner_text
|
||||
|
||||
# A horrible hack - YouTube uses `document.title` to populate the title
|
||||
|
Reference in New Issue
Block a user