mirror of
https://github.com/mastodon/mastodon.git
synced 2024-12-29 14:35:06 +01:00
LinkDetailsExtractor adjustments (#31357)
This commit is contained in:
parent
68c7782940
commit
0518613dd7
@ -157,7 +157,7 @@ class LinkDetailsExtractor
|
|||||||
end
|
end
|
||||||
|
|
||||||
def title
|
def title
|
||||||
html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip
|
html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip
|
||||||
end
|
end
|
||||||
|
|
||||||
def description
|
def description
|
||||||
@ -205,11 +205,11 @@ class LinkDetailsExtractor
|
|||||||
end
|
end
|
||||||
|
|
||||||
def language
|
def language
|
||||||
valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang'))
|
valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang'))
|
||||||
end
|
end
|
||||||
|
|
||||||
def icon
|
def icon
|
||||||
valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
|
valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon'))
|
||||||
end
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
@ -237,18 +237,20 @@ class LinkDetailsExtractor
|
|||||||
end
|
end
|
||||||
|
|
||||||
def link_tag(name)
|
def link_tag(name)
|
||||||
document.xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler).pick('href')
|
head.at_xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler)&.attr('href')
|
||||||
end
|
end
|
||||||
|
|
||||||
def opengraph_tag(name)
|
def opengraph_tag(name)
|
||||||
document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content')
|
head.at_xpath("//meta[nokogiri:casecmp(@property, '#{name}') or nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
|
||||||
end
|
end
|
||||||
|
|
||||||
def meta_tag(name)
|
def meta_tag(name)
|
||||||
document.xpath("//meta[@name=\"#{name}\"]").pick('content')
|
head.at_xpath("//meta[nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
|
||||||
end
|
end
|
||||||
|
|
||||||
def structured_data
|
def structured_data
|
||||||
|
return @structured_data if defined?(@structured_data)
|
||||||
|
|
||||||
# Some publications have more than one JSON-LD definition on the page,
|
# Some publications have more than one JSON-LD definition on the page,
|
||||||
# and some of those definitions aren't valid JSON either, so we have
|
# and some of those definitions aren't valid JSON either, so we have
|
||||||
# to loop through here until we find something that is the right type
|
# to loop through here until we find something that is the right type
|
||||||
@ -273,6 +275,10 @@ class LinkDetailsExtractor
|
|||||||
@document ||= detect_encoding_and_parse_document
|
@document ||= detect_encoding_and_parse_document
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def head
|
||||||
|
@head ||= document.at_xpath('/html/head')
|
||||||
|
end
|
||||||
|
|
||||||
def detect_encoding_and_parse_document
|
def detect_encoding_and_parse_document
|
||||||
html = nil
|
html = nil
|
||||||
encoding = nil
|
encoding = nil
|
||||||
|
@ -8,5 +8,9 @@ class NokogiriHandler
|
|||||||
def link_rel_include(token_list, token)
|
def link_rel_include(token_list, token)
|
||||||
token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase)
|
token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def casecmp(str1, str2)
|
||||||
|
str1.to_s.casecmp?(str2.to_s)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -49,7 +49,8 @@ RSpec.describe LinkDetailsExtractor do
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<title>Man bites dog</title>
|
<title>Man bites dog</title>
|
||||||
<meta name="description" content="A dog's tale">
|
<meta name="descripTION" content="A dog's tale">
|
||||||
|
<link rel="pretty IcoN" href="/favicon.ico">
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
HTML
|
HTML
|
||||||
@ -59,7 +60,8 @@ RSpec.describe LinkDetailsExtractor do
|
|||||||
.to have_attributes(
|
.to have_attributes(
|
||||||
title: eq('Man bites dog'),
|
title: eq('Man bites dog'),
|
||||||
description: eq("A dog's tale"),
|
description: eq("A dog's tale"),
|
||||||
language: eq('en')
|
language: eq('en'),
|
||||||
|
icon: eq('https://example.com/favicon.ico')
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@ -256,7 +258,7 @@ RSpec.describe LinkDetailsExtractor do
|
|||||||
<head>
|
<head>
|
||||||
<meta property="og:url" content="https://example.com/dog.html">
|
<meta property="og:url" content="https://example.com/dog.html">
|
||||||
<meta property="og:title" content="Man bites dog">
|
<meta property="og:title" content="Man bites dog">
|
||||||
<meta property="og:description" content="A dog's tale">
|
<meta property="OG:description" content="A dog's tale">
|
||||||
<meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
|
<meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
|
||||||
<meta property="og:author" content="Charlie Brown">
|
<meta property="og:author" content="Charlie Brown">
|
||||||
<meta property="og:locale" content="en">
|
<meta property="og:locale" content="en">
|
||||||
|
Loading…
Reference in New Issue
Block a user