Add proper support for IDN links.

This commit is contained in:
Jason Parker 2024-10-08 17:46:01 -04:00
parent 96ddf1d482
commit 65f6fba745

View File

@ -2,11 +2,27 @@
module Extractor module Extractor
MAX_DOMAIN_LENGTH = 253 MAX_DOMAIN_LENGTH = 253
MAX_URL_LENGTH = 4096
extend Twitter::TwitterText::Extractor extend Twitter::TwitterText::Extractor
module_function module_function
# Decides whether a URL still fits within MAX_URL_LENGTH once its domain has
# been converted to ASCII form with IDN::Idna.toASCII.
#
# @param url_length [Integer] length of the URL as supplied by the caller
# @param domain [String, nil] domain portion of the URL
# @param protocol [Object, nil] protocol part of the URL; when falsy,
#   URL_PROTOCOL_LENGTH is added to the length before the comparison
# @return [Boolean] true when the adjusted length is at most MAX_URL_LENGTH;
#   false when the domain is missing or the conversion/arithmetic raises a
#   StandardError
def is_valid_domain(url_length, domain, protocol)
  # A missing domain can never be valid; no need to raise and rescue for it.
  return false unless domain

  encoded_domain = IDN::Idna.toASCII(domain, IDN::Idna::ALLOW_UNASSIGNED)

  # Only growth is counted: an encoded form that is not longer than the
  # original leaves the URL length untouched.
  growth = encoded_domain.length - domain.length
  url_length += growth if growth > 0
  url_length += URL_PROTOCOL_LENGTH unless protocol
  url_length <= MAX_URL_LENGTH
rescue StandardError
  # On error don't consider this a valid domain. StandardError (rather than
  # Exception) is rescued so that Interrupt, SystemExit and similar
  # process-level signals still propagate to the caller.
  # NOTE(review): assumes IDN conversion failures derive from StandardError —
  # confirm against the idn-ruby version in use.
  false
end
def extract_entities_with_indices(text, options = {}, &block) def extract_entities_with_indices(text, options = {}, &block)
entities = extract_urls_with_indices(text, options) + entities = extract_urls_with_indices(text, options) +
extract_hashtags_with_indices(text, check_url_overlap: false) + extract_hashtags_with_indices(text, check_url_overlap: false) +