Incorporate better regex from review

Signed-off-by: RMidhunSuresh <rmidhunsuresh@gmail.com>
This commit is contained in:
RMidhunSuresh 2021-05-12 16:15:30 +05:30
parent 00bcdbab37
commit 851e8d34a4

View File

@@ -4,24 +4,20 @@ meaning that any escapes (\) must also
be escaped. be escaped.
*/ */
const scheme = "(?:https|http|ftp):\\/\\/"; const scheme = "(?:https|http|ftp):\\/\\/";
const host = "[a-zA-Z0-9:.\\[\\]-]"; const noSpaceNorPunctuation = "[^\\s.,?!]";
const hostCharacter = "[a-zA-Z0-9:.\\[\\]-]";
/* /*
A URL containing path (/) or fragment (#) component Using non-consuming group here to combine two criteria for the last character.
is allowed to end with any character which is not See point 1 below.
space nor punctuation. The ending character may be
non-ASCII.
*/ */
const end = "[^\\s.,?!]"; const host = `${hostCharacter}*(?=${hostCharacter})${noSpaceNorPunctuation}`;
const additional = `[\\/#][^\\s]*${end}`;
/* /*
Similarly, a URL not containing path or fragment must Use sub groups so we accept just / or #; but if anything comes after it,
also end with a character that is not space nor punctuation. it should not end with punctuation or space.
Additionally, the ending character must also be ASCII.
*/ */
const nonASCII = "\\u{80}-\\u{10ffff}"; const pathOrFragment = `(?:[\\/#](?:[^\\s]*${noSpaceNorPunctuation})?)`;
const endASCII = `[^\\s${nonASCII}.,?!]`;
/* /*
Things to keep in mind: Things to keep in mind:
@@ -30,7 +26,8 @@ Things to keep in mind:
https://matrix.org/<smiley> - valid https://matrix.org/<smiley> - valid
https://matrix.org<smiley> - invalid https://matrix.org<smiley> - invalid
2. Do not treat punctuation at the end as a part of the URL (.,?!) 2. Do not treat punctuation at the end as a part of the URL (.,?!)
3. Path/fragment is optional.
*/ */
const urlRegex = `${scheme}${host}+(?:${additional}|${endASCII})`; const urlRegex = `${scheme}${host}${pathOrFragment}?`;
export const regex = new RegExp(urlRegex, "gui"); export const regex = new RegExp(urlRegex, "gi");