import re

# Compiled pattern that locates URLs embedded in free text.
#
# A match must begin with "http://", "https://", "ftp://" or a bare
# "www." / "www<digits>." prefix, followed by a public IPv4 address, a
# host name ending in an alphabetic TLD, or the literal "localhost"
# (exposed as capture group 1). An optional port and path may follow.
#
# The non-ASCII range in the host/domain/TLD classes is written as
# ``\u00a1-\uffff`` with a SINGLE backslash: inside a raw string the ``re``
# module itself interprets ``\uXXXX``. A doubled backslash would instead
# yield a literal backslash, the letters "u", "0", "a", "f" and an accidental
# ``1-\`` range, which rejects internationalized host names and lets digits
# and punctuation such as "=" or "@" leak into host names and TLDs.
URL_REGEX = re.compile(
    # left boundary: start of string, or not preceded by a word char, "/" or "."
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier; a scheme-only form would be
    # (?:(?:https?|ftp)://), but bare "www." prefixes are accepted here too
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion: private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets:
    #   first octet 1-223 (excludes 0.x.x.x and reserved space >= 224.0.0.0)
    #   last octet 1-254 (excludes network & broadcast addresses)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier: letters only, at least two characters
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r"|"
    # literal "localhost" (capture group 1)
    r"(?:(localhost))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path; stops at whitespace and at ")", "]" or "}" so that
    # enclosing brackets are not swallowed into the URL
    r"(?:\/[^\)\]\}\s]*)?",
    # NOTE(@jfilter): the upstream trailing-boundary assertion
    #   (?:$|(?![\w?!+&\/\)]))
    # was removed because its purpose was unclear; the bracket exclusion in
    # the path above is used instead.
    flags=re.UNICODE | re.IGNORECASE,
)