FIX: Wikipedia onebox images and sections (#31384)

Both article images and section extraction were
not working for Wikipedia oneboxes. This commit
fixes both and updates our spec fixture responses
to use the new HTML.
This commit is contained in:
Martin Brennan
2025-02-19 14:29:17 +10:00
committed by GitHub
parent 143a824449
commit a0e1a12161
4 changed files with 2092 additions and 991 deletions

View File

@ -30,14 +30,18 @@ module Onebox
if m_url_hash.nil? # no hash found in url
paras = raw.search("p") # default get all the paras
else
section_header_title = raw.xpath("//span[@id='#{CGI.unescape(m_url_hash_name)}']")
section_header_title =
raw.xpath(
"//*[@id=\"#{CGI.unescape(m_url_hash_name)}\"][self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]",
)
if section_header_title.empty?
paras = raw.search("p") # default get all the paras
else
section_title_text = section_header_title.inner_text
section_header = section_header_title[0].parent # parent element of the section span element should be an <h3> node
cur_element = section_header
# Get .mw-heading which wraps the h* element
cur_element = section_header_title[0].parent
# p|text|div covers the general case. We assume presence of at least 1 P node. if section has no P node we may end up with a P node from the next section.
# div tag is commonly used as an assets wrapper in an article section, often as the first element holding an image.
@ -95,7 +99,7 @@ module Onebox
description: text,
}
img = raw.css(".image img")
img = raw.css(".infobox-image img")
if img && img.size > 0
img.each do |i|

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -21,7 +21,7 @@ RSpec.describe Onebox::Engine::WikipediaOnebox do
end
it "includes summary" do
expect(html).to include("Billy Jack is a 1971 action/drama")
expect(html).to include("Billy Jack is a 1971 American action drama independent film")
end
end
@ -34,9 +34,7 @@ RSpec.describe Onebox::Engine::WikipediaOnebox do
end
describe "url with url-encoded section hash" do
let(:wp_link) do
"https://fr.wikipedia.org/wiki/Th%C3%A9ologie#La_th%C3%A9ologie_selon_Aristote"
end
let(:wp_link) { "https://fr.wikipedia.org/wiki/Th%C3%A9ologie#L'ontoth%C3%A9ologie" }
before do
stub_request(:get, "https://fr.wikipedia.org/wiki/Th%C3%A9ologie").to_return(
@ -46,7 +44,7 @@ RSpec.describe Onebox::Engine::WikipediaOnebox do
end
it "includes summary" do
expect(html).to include("Le terme est repris par")
expect(html).to include("investigation rationnelle sur les substances divines")
end
end