commit e8e1e1a7d09c614b57fac5070eb5c28822c948ba
parent 5a9951db80a5e9b9f2d5ad7ca1c6efebbd00e11f
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Fri, 12 Mar 2021 22:22:13 +0100
rework URI handling
- Parse the URI more correctly, following RFC 4266 (The gopher URI Scheme) and
   RFC 3986 (Uniform Resource Identifier (URI): Generic Syntax).
- A URI fragment is no longer sent to the server.
- A gopher type is now optional when the path is empty, for example:
  "gopher://codemadness.org".
Also, the use of strlcat() is removed, so the code should now be more portable.
Diffstat:
| M | hurl.c |  |  | 175 | ++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------- | 
1 file changed, 116 insertions(+), 59 deletions(-)
diff --git a/hurl.c b/hurl.c
@@ -28,12 +28,15 @@
 #define TLS_CA_CERT_FILE "/etc/ssl/cert.pem"
 #endif
 
-/* uri */
+/* URI */
 struct uri {
-	char proto[48];
+	char proto[48];     /* scheme including ":" or "://" */
+	char userinfo[256]; /* username [:password] */
 	char host[256];
-	char path[2048];
-	char port[6];     /* numeric port */
+	char port[6];       /* numeric port */
+	char path[1024];
+	char query[1024];
+	char fragment[1024];
 };
 
 char *argv0;
@@ -61,70 +64,115 @@ sighandler(int signo)
 }
 
 int
-parseuri(const char *s, struct uri *u)
+uri_parse(const char *s, struct uri *u)
 {
-	const char *p = s, *b;
-	char *endptr = NULL;
+	const char *p = s;
+	char *endptr;
 	size_t i;
-	unsigned long l;
+	long l;
 
-	u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
-	if (!*p)
-		return 0;
+	u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
+	u->path[0] = u->query[0] = u->fragment[0] = '\0';
 
-	/* protocol part */
-	for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
-		       *p == '+' || *p == '-' || *p == '.'); p++)
+	/* protocol-relative */
+	if (*p == '/' && *(p + 1) == '/') {
+		p += 2; /* skip "//" */
+		goto parseauth;
+	}
+
+	/* scheme / protocol part */
+	for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+		       *p == '+' || *p == '-' || *p == '.'; p++)
 		;
-	if (!strncmp(p, "://", 3)) {
+	/* scheme, except if empty and starts with ":" then it is a path */
+	if (*p == ':' && p != s) {
+		if (*(p + 1) == '/' && *(p + 2) == '/')
+			p += 3; /* skip "://" */
+		else
+			p++; /* skip ":" */
+
 		if ((size_t)(p - s) >= sizeof(u->proto))
 			return -1; /* protocol too long */
 		memcpy(u->proto, s, p - s);
 		u->proto[p - s] = '\0';
-		p += 3; /* skip "://" */
+
+		if (*(p - 1) != '/')
+			goto parsepath;
 	} else {
-		return -1; /* no protocol specified */
+		p = s; /* no scheme format, reset to start */
+		goto parsepath;
+	}
+
+parseauth:
+	/* userinfo (username:password) */
+	i = strcspn(p, "@/?#");
+	if (p[i] == '@') {
+		if (i >= sizeof(u->userinfo))
+			return -1; /* userinfo too long */
+		memcpy(u->userinfo, p, i);
+		u->userinfo[i] = '\0';
+		p += i + 1;
 	}
 
 	/* IPv6 address */
 	if (*p == '[') {
-		/* bracket not found or host too long */
-		if (!(b = strchr(p, ']')) || (size_t)(b - p) >= (ssize_t)sizeof(u->host))
+		/* bracket not found, host too short or too long */
+		i = strcspn(p, "]");
+		if (p[i] != ']' || i < 3)
 			return -1;
-		memcpy(u->host, p + 1, b - p - 1);
-		u->host[b - p - 1] = '\0';
-		p = b + 1;
+		i++; /* including "]" */
 	} else {
 		/* domain / host part, skip until port, path or end. */
-		if ((i = strcspn(p, ":/")) >= sizeof(u->host))
-			return -1; /* host too long */
-		memcpy(u->host, p, i);
-		u->host[i] = '\0';
-		p = &p[i];
+		i = strcspn(p, ":/?#");
 	}
+	if (i >= sizeof(u->host))
+		return -1; /* host too long */
+	memcpy(u->host, p, i);
+	u->host[i] = '\0';
+	p += i;
+
 	/* port */
 	if (*p == ':') {
-		if ((i = strcspn(++p, "/")) >= sizeof(u->port))
+		p++;
+		if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
 			return -1; /* port too long */
 		memcpy(u->port, p, i);
 		u->port[i] = '\0';
-		/* check for valid port: range 1 - 65535 */
+		/* check for valid port: range 1 - 65535, may be empty */
 		errno = 0;
-		l = strtoul(u->port, &endptr, 10);
-		if (errno || u->port[0] == '\0' || *endptr ||
-		    !l || l > 65535)
+		l = strtol(u->port, &endptr, 10);
+		if (i && (errno || *endptr || l <= 0 || l > 65535))
 			return -1;
-		p = &p[i];
+		p += i;
 	}
-	if (u->host[0]) {
-		p = &p[strspn(p, "/")];
-		memcpy(u->path, "/", 2);
-	} else {
-		return -1;
+
+parsepath:
+	/* path */
+	if ((i = strcspn(p, "?#")) >= sizeof(u->path))
+		return -1; /* path too long */
+	memcpy(u->path, p, i);
+	u->path[i] = '\0';
+	p += i;
+
+	/* query */
+	if (*p == '?') {
+		p++;
+		if ((i = strcspn(p, "#")) >= sizeof(u->query))
+			return -1; /* query too long */
+		memcpy(u->query, p, i);
+		u->query[i] = '\0';
+		p += i;
 	}
-	/* treat truncation as an error */
-	if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
-		return -1;
+
+	/* fragment */
+	if (*p == '#') {
+		p++;
+		if ((i = strlen(p)) >= sizeof(u->fragment))
+			return -1; /* fragment too long */
+		memcpy(u->fragment, p, i);
+		u->fragment[i] = '\0';
+	}
+
 	return 0;
 }
 
@@ -206,11 +254,14 @@ https_request(void)
 
 	/* create and send HTTP header */
 	r = snprintf(buf, sizeof(buf),
-		"GET %s HTTP/1.0\r\n"
+		"GET %s%s%s HTTP/1.0\r\n"
 		"Host: %s%s%s\r\n"
 		"Connection: close\r\n"
 		"%s%s"
-		"\r\n", u.path, u.host,
+		"\r\n",
+		u.path[0] ? u.path : "/",
+		u.query[0] ? "?" : "", u.query,
+		u.host,
 		stdport ? "" : ":",
 		stdport ? "" : u.port,
 		config_headers, config_headers[0] ? "\r\n" : "");
@@ -334,11 +385,14 @@ http_request(void)
 
 	/* create and send HTTP header */
 	r = snprintf(buf, sizeof(buf),
-		"GET %s HTTP/1.0\r\n"
+		"GET %s%s%s HTTP/1.0\r\n"
 		"Host: %s%s%s\r\n"
 		"Connection: close\r\n"
 		"%s%s"
-		"\r\n", u.path, u.host,
+		"\r\n",
+		u.path[0] ? u.path : "/",
+		u.query[0] ? "?" : "", u.query,
+		u.host,
 		stdport ? "" : ":",
 		stdport ? "" : u.port,
 		config_headers, config_headers[0] ? "\r\n" : "");
@@ -427,7 +481,7 @@ int
 gopher_request(void)
 {
 	char buf[READ_BUF_SIZ], *p;
-	const char *errstr;
+	const char *errstr, *path;
 	size_t len = 0;
 	ssize_t r;
 	int fd = -1, ret = 1;
@@ -440,8 +494,13 @@ gopher_request(void)
 	if (pledge("stdio", NULL) == -1)
 		err(1, "pledge");
 
-	/* create and send path, skip type part */
-	r = snprintf(buf, sizeof(buf), "%s\r\n", u.path + 2);
+	/* create and send path, skip type part, empty path is allowed,
+	   see RFC 4266 The gopher URI Scheme - section 2.1 */
+	path = u.path;
+	if (*path == '/' && *path++)
+		path++;
+	r = snprintf(buf, sizeof(buf), "%s%s%s\r\n",
+		path, u.query[0] ? "?" : "", u.query);
 	if (r < 0 || (size_t)r >= sizeof(buf)) {
 		fprintf(stderr, "not writing header because it is truncated");
 		goto err;
@@ -623,8 +682,10 @@ main(int argc, char **argv)
 		usage();
 
 	url = argv[0];
-	if (parseuri(url, &u) == -1)
-		errx(1, "invalid url: %s", url);
+	if (uri_parse(url, &u) == -1)
+		errx(1, "invalid URL: %s", url);
+	if (u.userinfo[0])
+		errx(1, "userinfo field not supported in the URL: %s", url);
 
 	if (config_timeout > 0) {
 		signal(SIGALRM, sighandler);
@@ -632,7 +693,7 @@ main(int argc, char **argv)
 			err(1, "alarm");
 	}
 
-	if (!strcmp(u.proto, "https")) {
+	if (!strcmp(u.proto, "https://")) {
 		if (tls_init())
 			errx(1, "tls_init failed");
 		if (!(tls_config = tls_config_new()))
@@ -643,22 +704,18 @@ main(int argc, char **argv)
 				errx(1, "tls set ciphers failed: %s",
 				     tls_config_error(tls_config));
 		}
-		if (!u.port[0] && !strcmp(u.proto, "https"))
+		if (!u.port[0])
 			memcpy(u.port, "443", 4);
 		statuscode = https_request();
-	} else if (!strcmp(u.proto, "http")) {
+	} else if (!strcmp(u.proto, "http://")) {
 		if (!u.port[0])
 			memcpy(u.port, "80", 3);
 		statuscode = http_request();
-	} else if (!strcmp(u.proto, "gopher")) {
+	} else if (!strcmp(u.proto, "gopher://")) {
 		if (!u.port[0])
 			memcpy(u.port, "70", 3);
-
-		if (u.path[0] != '/' || u.path[1] == '\0')
-			errx(1, "must specify type");
-
 		statuscode = gopher_request();
-	} else if (!strcmp(u.proto, "gophers")) {
+	} else if (!strcmp(u.proto, "gophers://")) {
 		if (tls_init())
 			errx(1, "tls_init failed");
 		if (!(tls_config = tls_config_new()))