diff options
author | Diogo Sousa <diogogsousa@gmail.com> | 2013-06-16 15:36:52 +0100 |
---|---|---|
committer | Diogo Sousa <diogogsousa@gmail.com> | 2013-06-16 15:36:52 +0100 |
commit | 805b33552bcf0a70abe57e7d508731353ec8e385 (patch) | |
tree | 3468b0a34159a5b21f959ebf7f9be3dcf8ac911c | |
parent | 6bc05a8bc835d38e7e245bb70ee748809f5277b2 (diff) |
Rework url matching to make it easier to add schemes.
The new way allows great control of what a url is composed of. Added a bunch of new schemes.
-rw-r--r-- | src/common/url.c | 161 |
1 files changed, 112 insertions, 49 deletions
diff --git a/src/common/url.c b/src/common/url.c index 84c112d1..def77628 100644 --- a/src/common/url.c +++ b/src/common/url.c @@ -314,8 +314,8 @@ do_an_re(const char *word,int *start, int *end, int *type) } func_t; func_t funcs[] = { - { re_email, WORD_EMAIL }, { re_url, WORD_URL }, + { re_email, WORD_EMAIL }, { re_channel, WORD_CHANNEL }, { re_host, WORD_HOST }, { re_path, WORD_PATH }, @@ -360,7 +360,7 @@ make_re(char *grist, char *type) GRegex *ret; GError *err = NULL; - ret = g_regex_new (grist, G_REGEX_CASELESS + G_REGEX_OPTIMIZE, 0, &err); + ret = g_regex_new (grist, G_REGEX_CASELESS | G_REGEX_OPTIMIZE, 0, &err); g_free (grist); return ret; } @@ -389,60 +389,123 @@ re_host (void) #define LPAR "\\(" #define RPAR "\\)" #define NOPARENS "[^() \t]*" +#define PATH \ + "(" \ + "(" LPAR NOPARENS RPAR ")" \ + "|" \ + "(" NOPARENS ")" \ + ")*" /* Zero or more occurrences of either of these */ \ + "(?<![.,?!\\]])" /* Not allowed to end with these */ +#define USERINFO "([-a-z0-9._~%]+@)" + +/* Flags used to describe URIs (RFC 3986) + * + * Bellow is an example of what the flags match. + * + * URI_AUTHORITY - http://example.org:80/foo/bar + * ^^^^^^^^^^^^^^^^ + * URI_USERINFO/URI_OPT_USERINFO - http://user@example.org:80/foo/bar + * ^^^^^ + * URI_PATH - http://example.org:80/foo/bar + * ^^^^^^^^ + */ +#define URI_AUTHORITY (1 << 0) +#define URI_OPT_USERINFO (1 << 1) +#define URI_USERINFO (1 << 2) +#define URI_PATH (1 << 3) -char *prefix[] = { - "irc\\.", - "ftp\\.", - "www\\.", - "irc://", - "ircs://", - "ftp://", - "http://", - "https://", - "file://", - "rtsp://", - NULL +struct +{ + const char *scheme; /* scheme name. e.g. http */ + const char *path_sep; /* string that begins the path */ + int flags; /* see above (flag definitions) */ +} uri[] = { + { "irc", "/", URI_AUTHORITY | URI_PATH }, + { "ircs", "/", URI_AUTHORITY | URI_PATH }, + { "rtsp", "/", URI_AUTHORITY | URI_PATH }, + { "feed", "/", URI_AUTHORITY | URI_PATH }, + { "teamspeak", "?", URI_AUTHORITY | URI_PATH }, + { "ftp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "sftp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "ftps", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "http", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "https", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "cvs", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "svn", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "git", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "rsync", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "mumble", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "ventrilo", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "xmpp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "file", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "h323", ";", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "imap", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "pop", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "nfs", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "smb", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH }, + { "ssh", "", URI_AUTHORITY | URI_OPT_USERINFO }, + { "sip", "", URI_AUTHORITY | URI_USERINFO }, + { "sips", "", URI_AUTHORITY | URI_USERINFO }, + { "magnet", "?", URI_PATH }, + { "mailto", "", URI_PATH }, + { "bitcoin", "", URI_PATH }, + { "gtalk", "", URI_PATH }, + { "steam", "", URI_PATH }, + { NULL, '\0', 0} }; static GRegex * re_url (void) { - static GRegex *url_ret; + static GRegex *url_ret = NULL; + GString *grist_gstr; char *grist; - char *scheme; + int i; if (url_ret) return url_ret; - scheme = g_strjoinv ("|", prefix); - grist = g_strdup_printf ( - "(" /* URL or HOST */ - "(" - SCHEME HOST OPT_PORT - "(" /* Optional "/path?query_string#fragment_id" */ - "/" /* Must start with slash */ - "(" - "(" LPAR NOPARENS RPAR ")" - "|" - "(" NOPARENS ")" - ")*" /* Zero or more occurrences of either of these */ - "(?<![.,?!\\]])" /* Not allowed to end with these */ - ")?" /* Zero or one of this /path?query_string#fragment_id thing */ - ")|(" - HOST OPT_PORT "/" - "(" /* Optional "path?query_string#fragment_id" */ - "(" - "(" LPAR NOPARENS RPAR ")" - "|" - "(" NOPARENS ")" - ")*" /* Zero or more occurrences of either of these */ - "(?<![.,?!\\]])" /* Not allowed to end with these */ - ")?" /* Zero or one of this /path?query_string#fragment_id thing */ - ")" - ")" - , scheme - ); + grist_gstr = g_string_new (NULL); + + /* Add regex "host/path", representing a "schemeless" url */ + g_string_append (grist_gstr, "(" HOST OPT_PORT "/" "(" PATH ")?" ")"); + + for (i = 0; uri[i].scheme; i++) + { + g_string_append (grist_gstr, "|("); + g_string_append_printf (grist_gstr, "%s:", uri[i].scheme); + + if (uri[i].flags & URI_AUTHORITY) + g_string_append (grist_gstr, "//"); + + if (uri[i].flags & URI_USERINFO) + g_string_append (grist_gstr, USERINFO); + else if (uri[i].flags & URI_OPT_USERINFO) + g_string_append (grist_gstr, USERINFO "?"); + + if (uri[i].flags & URI_AUTHORITY) + g_string_append (grist_gstr, HOST OPT_PORT); + + if (uri[i].flags & URI_PATH) + { + char *sep_escaped; + + sep_escaped = g_regex_escape_string (uri[i].path_sep, + strlen(uri[i].path_sep)); + + g_string_append_printf(grist_gstr, "(" "%s" PATH ")?", + sep_escaped); + + g_free(sep_escaped); + } + + + g_string_append(grist_gstr, ")"); + } + + grist = g_string_free (grist_gstr, FALSE); + url_ret = make_re (grist, "re_url"); - g_free (scheme); + return url_ret; } @@ -525,10 +588,10 @@ re_channel (void) /* PATH description --- */ #ifdef WIN32 /* Windows path can be .\ ..\ or C: D: etc */ -#define PATH "^(\\.{1,2}\\\\|[a-z]:).*" +#define FS_PATH "^(\\.{1,2}\\\\|[a-z]:).*" #else /* Linux path can be / or ./ or ../ etc */ -#define PATH "^(/|\\./|\\.\\./).*" +#define FS_PATH "^(/|\\./|\\.\\./).*" #endif static GRegex * @@ -540,8 +603,8 @@ re_path (void) if (path_ret) return path_ret; grist = g_strdup_printf ( - "(" /* PATH */ - PATH + "(" /* FS_PATH */ + FS_PATH ")" ); path_ret = make_re (grist, "re_path"); |