diff --git a/lib/onebox/engine/wikipedia_onebox.rb b/lib/onebox/engine/wikipedia_onebox.rb
index 084166b229e..5a23ec54d02 100644
--- a/lib/onebox/engine/wikipedia_onebox.rb
+++ b/lib/onebox/engine/wikipedia_onebox.rb
@@ -30,14 +30,18 @@ module Onebox
if m_url_hash.nil? # no hash found in url
paras = raw.search("p") # default get all the paras
else
- section_header_title = raw.xpath("//span[@id='#{CGI.unescape(m_url_hash_name)}']")
+ section_header_title =
+ raw.xpath(
+ "//*[@id=\"#{CGI.unescape(m_url_hash_name)}\"][self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]",
+ )
if section_header_title.empty?
paras = raw.search("p") # default get all the paras
else
section_title_text = section_header_title.inner_text
- section_header = section_header_title[0].parent # parent element of the section span element should be an
node
- cur_element = section_header
+
+ # Get .mw-heading which wraps the h* element
+ cur_element = section_header_title[0].parent
# p|text|div covers the general case. We assume the presence of at least one P node; if the section has no P node, we may end up with a P node from the next section.
# A div tag is commonly used as an asset wrapper in an article section, often as the first element holding an image.
@@ -95,7 +99,7 @@ module Onebox
description: text,
}
- img = raw.css(".image img")
+ img = raw.css(".infobox-image img")
if img && img.size > 0
img.each do |i|
diff --git a/spec/fixtures/onebox/wikipedia.response b/spec/fixtures/onebox/wikipedia.response
index 3eb69cadd58..e2479c05b7d 100644
--- a/spec/fixtures/onebox/wikipedia.response
+++ b/spec/fixtures/onebox/wikipedia.response
@@ -1,253 +1,791 @@
-
+
-
-Billy Jack - Wikipedia, the free encyclopedia
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+Billy Jack - Wikipedia
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Jump to content
+