findUrls: be pickier about what can precede a URL

findUrls() was seeing strings like "You have 1 new message in
foo@example.com/Inbox" and finding the URL
"[http://]example.com/Inbox". Require that a URL either begin at the
start of the string or be preceded by whitespace or an opening
paren/bracket/brace/quote/etc.

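(Since JS doesn't have look-behind assertions like Perl does, we have
to actually match the URL-preceding character in the regex, and then
adjust the result findUrls() returns accordingly: the URL is taken from
the second capture group, and its position is the match index plus the
length of the first capture group.)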

https://bugzilla.gnome.org/show_bug.cgi?id=636252
This commit is contained in:
Dan Winship 2011-04-13 09:40:28 -04:00
parent 563221698c
commit e2898bea5c

View File

@ -9,10 +9,12 @@ const Main = imports.ui.main;
// http://daringfireball.net/2010/07/improved_regex_for_matching_urls // http://daringfireball.net/2010/07/improved_regex_for_matching_urls
const _balancedParens = '\\((?:[^\\s()<>]+|(?:\\(?:[^\\s()<>]+\\)))*\\)'; const _balancedParens = '\\((?:[^\\s()<>]+|(?:\\(?:[^\\s()<>]+\\)))*\\)';
const _leadingJunk = '[\\s`(\\[{\'\\"<\u00AB\u201C\u2018]';
const _notTrailingJunk = '[^\\s`!()\\[\\]{};:\'\\".,<>?\u00AB\u00BB\u201C\u201D\u2018\u2019]'; const _notTrailingJunk = '[^\\s`!()\\[\\]{};:\'\\".,<>?\u00AB\u00BB\u201C\u201D\u2018\u2019]';
const _urlRegexp = new RegExp( const _urlRegexp = new RegExp(
'\\b(' + '(^|' + _leadingJunk + ')' +
'(' +
'(?:' + '(?:' +
'[a-z][\\w-]+://' + // scheme:// '[a-z][\\w-]+://' + // scheme://
'|' + '|' +
@ -43,7 +45,7 @@ const _urlRegexp = new RegExp(
function findUrls(str) { function findUrls(str) {
let res = [], match; let res = [], match;
while ((match = _urlRegexp.exec(str))) while ((match = _urlRegexp.exec(str)))
res.push({ url: match[0], pos: match.index }); res.push({ url: match[2], pos: match.index + match[1].length });
return res; return res;
} }