hurl

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit e8e1e1a7d09c614b57fac5070eb5c28822c948ba
parent 5a9951db80a5e9b9f2d5ad7ca1c6efebbd00e11f
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Fri, 12 Mar 2021 22:22:13 +0100

rework URI handling

- Parse the URI in a more correct way following the Gopher URI RFC 4266 and
   General URI RFC 3986 - Uniform Resource Identifier (URI): Generic Syntax.
- A URI fragment is no longer sent to the server.
- A gopher type is now optional for an empty path, for example:
  "gopher://codemadness.org".

Also, the use of strlcat() is removed, so the code should now be more portable.

Diffstat:
M hurl.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
1 file changed, 116 insertions(+), 59 deletions(-)

diff --git a/hurl.c b/hurl.c @@ -28,12 +28,15 @@ #define TLS_CA_CERT_FILE "/etc/ssl/cert.pem" #endif -/* uri */ +/* URI */ struct uri { - char proto[48]; + char proto[48]; /* scheme including ":" or "://" */ + char userinfo[256]; /* username [:password] */ char host[256]; - char path[2048]; - char port[6]; /* numeric port */ + char port[6]; /* numeric port */ + char path[1024]; + char query[1024]; + char fragment[1024]; }; char *argv0; @@ -61,70 +64,115 @@ sighandler(int signo) } int -parseuri(const char *s, struct uri *u) +uri_parse(const char *s, struct uri *u) { - const char *p = s, *b; - char *endptr = NULL; + const char *p = s; + char *endptr; size_t i; - unsigned long l; + long l; - u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0'; - if (!*p) - return 0; + u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0'; + u->path[0] = u->query[0] = u->fragment[0] = '\0'; - /* protocol part */ - for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || - *p == '+' || *p == '-' || *p == '.'); p++) + /* protocol-relative */ + if (*p == '/' && *(p + 1) == '/') { + p += 2; /* skip "//" */ + goto parseauth; + } + + /* scheme / protocol part */ + for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || + *p == '+' || *p == '-' || *p == '.'; p++) ; - if (!strncmp(p, "://", 3)) { + /* scheme, except if empty and starts with ":" then it is a path */ + if (*p == ':' && p != s) { + if (*(p + 1) == '/' && *(p + 2) == '/') + p += 3; /* skip "://" */ + else + p++; /* skip ":" */ + if ((size_t)(p - s) >= sizeof(u->proto)) return -1; /* protocol too long */ memcpy(u->proto, s, p - s); u->proto[p - s] = '\0'; - p += 3; /* skip "://" */ + + if (*(p - 1) != '/') + goto parsepath; } else { - return -1; /* no protocol specified */ + p = s; /* no scheme format, reset to start */ + goto parsepath; + } + +parseauth: + /* userinfo (username:password) */ + i = strcspn(p, "@/?#"); + if (p[i] == '@') { + if (i >= sizeof(u->userinfo)) + return -1; /* 
userinfo too long */ + memcpy(u->userinfo, p, i); + u->userinfo[i] = '\0'; + p += i + 1; } /* IPv6 address */ if (*p == '[') { - /* bracket not found or host too long */ - if (!(b = strchr(p, ']')) || (size_t)(b - p) >= (ssize_t)sizeof(u->host)) + /* bracket not found, host too short or too long */ + i = strcspn(p, "]"); + if (p[i] != ']' || i < 3) return -1; - memcpy(u->host, p + 1, b - p - 1); - u->host[b - p - 1] = '\0'; - p = b + 1; + i++; /* including "]" */ } else { /* domain / host part, skip until port, path or end. */ - if ((i = strcspn(p, ":/")) >= sizeof(u->host)) - return -1; /* host too long */ - memcpy(u->host, p, i); - u->host[i] = '\0'; - p = &p[i]; + i = strcspn(p, ":/?#"); } + if (i >= sizeof(u->host)) + return -1; /* host too long */ + memcpy(u->host, p, i); + u->host[i] = '\0'; + p += i; + /* port */ if (*p == ':') { - if ((i = strcspn(++p, "/")) >= sizeof(u->port)) + p++; + if ((i = strcspn(p, "/?#")) >= sizeof(u->port)) return -1; /* port too long */ memcpy(u->port, p, i); u->port[i] = '\0'; - /* check for valid port: range 1 - 65535 */ + /* check for valid port: range 1 - 65535, may be empty */ errno = 0; - l = strtoul(u->port, &endptr, 10); - if (errno || u->port[0] == '\0' || *endptr || - !l || l > 65535) + l = strtol(u->port, &endptr, 10); + if (i && (errno || *endptr || l <= 0 || l > 65535)) return -1; - p = &p[i]; + p += i; } - if (u->host[0]) { - p = &p[strspn(p, "/")]; - memcpy(u->path, "/", 2); - } else { - return -1; + +parsepath: + /* path */ + if ((i = strcspn(p, "?#")) >= sizeof(u->path)) + return -1; /* path too long */ + memcpy(u->path, p, i); + u->path[i] = '\0'; + p += i; + + /* query */ + if (*p == '?') { + p++; + if ((i = strcspn(p, "#")) >= sizeof(u->query)) + return -1; /* query too long */ + memcpy(u->query, p, i); + u->query[i] = '\0'; + p += i; } - /* treat truncation as an error */ - if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path)) - return -1; + + /* fragment */ + if (*p == '#') { + p++; + if ((i = 
strlen(p)) >= sizeof(u->fragment)) + return -1; /* fragment too long */ + memcpy(u->fragment, p, i); + u->fragment[i] = '\0'; + } + return 0; } @@ -206,11 +254,14 @@ https_request(void) /* create and send HTTP header */ r = snprintf(buf, sizeof(buf), - "GET %s HTTP/1.0\r\n" + "GET %s%s%s HTTP/1.0\r\n" "Host: %s%s%s\r\n" "Connection: close\r\n" "%s%s" - "\r\n", u.path, u.host, + "\r\n", + u.path[0] ? u.path : "/", + u.query[0] ? "?" : "", u.query, + u.host, stdport ? "" : ":", stdport ? "" : u.port, config_headers, config_headers[0] ? "\r\n" : ""); @@ -334,11 +385,14 @@ http_request(void) /* create and send HTTP header */ r = snprintf(buf, sizeof(buf), - "GET %s HTTP/1.0\r\n" + "GET %s%s%s HTTP/1.0\r\n" "Host: %s%s%s\r\n" "Connection: close\r\n" "%s%s" - "\r\n", u.path, u.host, + "\r\n", + u.path[0] ? u.path : "/", + u.query[0] ? "?" : "", u.query, + u.host, stdport ? "" : ":", stdport ? "" : u.port, config_headers, config_headers[0] ? "\r\n" : ""); @@ -427,7 +481,7 @@ int gopher_request(void) { char buf[READ_BUF_SIZ], *p; - const char *errstr; + const char *errstr, *path; size_t len = 0; ssize_t r; int fd = -1, ret = 1; @@ -440,8 +494,13 @@ gopher_request(void) if (pledge("stdio", NULL) == -1) err(1, "pledge"); - /* create and send path, skip type part */ - r = snprintf(buf, sizeof(buf), "%s\r\n", u.path + 2); + /* create and send path, skip type part, empty path is allowed, + see RFC 4266 The gopher URI Scheme - section 2.1 */ + path = u.path; + if (*path == '/' && *path++) + path++; + r = snprintf(buf, sizeof(buf), "%s%s%s\r\n", + path, u.query[0] ? "?" 
: "", u.query); if (r < 0 || (size_t)r >= sizeof(buf)) { fprintf(stderr, "not writing header because it is truncated"); goto err; @@ -623,8 +682,10 @@ main(int argc, char **argv) usage(); url = argv[0]; - if (parseuri(url, &u) == -1) - errx(1, "invalid url: %s", url); + if (uri_parse(url, &u) == -1) + errx(1, "invalid URL: %s", url); + if (u.userinfo[0]) + errx(1, "userinfo field not supported in the URL: %s", url); if (config_timeout > 0) { signal(SIGALRM, sighandler); @@ -632,7 +693,7 @@ main(int argc, char **argv) err(1, "alarm"); } - if (!strcmp(u.proto, "https")) { + if (!strcmp(u.proto, "https://")) { if (tls_init()) errx(1, "tls_init failed"); if (!(tls_config = tls_config_new())) @@ -643,22 +704,18 @@ main(int argc, char **argv) errx(1, "tls set ciphers failed: %s", tls_config_error(tls_config)); } - if (!u.port[0] && !strcmp(u.proto, "https")) + if (!u.port[0]) memcpy(u.port, "443", 4); statuscode = https_request(); - } else if (!strcmp(u.proto, "http")) { + } else if (!strcmp(u.proto, "http://")) { if (!u.port[0]) memcpy(u.port, "80", 3); statuscode = http_request(); - } else if (!strcmp(u.proto, "gopher")) { + } else if (!strcmp(u.proto, "gopher://")) { if (!u.port[0]) memcpy(u.port, "70", 3); - - if (u.path[0] != '/' || u.path[1] == '\0') - errx(1, "must specify type"); - statuscode = gopher_request(); - } else if (!strcmp(u.proto, "gophers")) { + } else if (!strcmp(u.proto, "gophers://")) { if (tls_init()) errx(1, "tls_init failed"); if (!(tls_config = tls_config_new()))