feat(entry): url encode entry.link

This makes it possible to use spaces and some reserved characters of URLs in post filenames.
author: sternenseemann <sternenseemann@systemli.org> 2020-08-12 14:44:00 +0200
committer: sternenseemann <sternenseemann@systemli.org> 2020-08-12 14:44:00 +0200
commit: a9fd16d86376e4d80a08728a7da44d4c653fa796 (patch)
tree: f8041394ac89d32c212007a190ee220d7331095f
parent: 56cee5404fae78b979a00609271b9528df1a8987 (diff)
3 files changed, 141 insertions, 2 deletions
diff --git a/cgiutil.c b/cgiutil.c
index 410ea62..606a10e 100644
--- a/cgiutil.c
+++ b/cgiutil.c
@@ -1,5 +1,7 @@
 #include <errno.h>
+#include <stdbool.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
 void send_header(char key[], char val[]) {
@@ -43,3 +45,97 @@ int http_errno(int err) {
             return 500;
     }
 }
+
+char nibble_hex(short h) {
+    switch(h) {
+        case 0:
+        case 1:
+        case 2:
+        case 3:
+        case 4:
+        case 5:
+        case 6:
+        case 7:
+        case 8:
+        case 9:
+            return (h + 48);
+        case 10:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+            return (h + 55);
+        default:
+            return 0;
+    }
+}
+
+int urlencode_realloc(char **input, int size) {
+    if(*input == NULL || size <= 0) {
+        return -1;
+    }
+
+    int output_size = size;
+    char *output = malloc(output_size);
+    int output_pos = 0;
+
+    if(output == NULL) {
+        return -1;
+    }
+
+    for(int i = 0; i < size; i++) {
+        char c = *(*input + i);
+        bool needs_escape;
+        switch(c) {
+            // generic delimiters
+            // we assume we never need to escape '/'. This
+            // should hold since on unix filenames won't
+            // contain slashes and the basis for all URLs
+            // in sternenblog are actual files
+            case ':': case '?': case '#': case '[': case ']': case '@':
+            // sub delimiters
+            case '!': case '$': case '&': case '\'': case '(': case ')':
+            case '*': case '+': case ',': case ';': case '=':
+            // other characters to encode
+            case '%': case ' ':
+                needs_escape = 1;
+                break;
+            // in order to simplify the code we just assume
+            // everything else doesn't have to be encoded
+            //
+            // otherwise we'd need to be UTF-8 aware here
+            // and consider more than one byte at a time.
+            default:
+                needs_escape = 0;
+        }
+
+        int necessary_space = needs_escape ? 3 : 1;
+
+        if(output_pos + necessary_space >= output_size) {
+            output_size += necessary_space;
+            char *tmp = realloc(output, output_size);
+            if(tmp == NULL) {
+                free(output);
+                return -1;
+            } else {
+                output = tmp;
+            }
+        }
+
+        if(needs_escape) {
+            short a = (c & 0xf0) >> 4;
+            short b = c & 0x0f;
+            output[output_pos++] = '%';
+            output[output_pos++] = nibble_hex(a);
+            output[output_pos++] = nibble_hex(b);
+        } else {
+            output[output_pos++] = c;
+        }
+    }
+
+    free(*input);
+    *input = output;
+
+    return output_size;
+}
diff --git a/cgiutil.h b/cgiutil.h
index 1d388af..4701e83 100644
--- a/cgiutil.h
+++ b/cgiutil.h
@@ -51,3 +51,41 @@ char *http_status_line(int status);
  * @return HTTP error code
  */
 int http_errno(int err);
+
+/*!
+ * @brief Urlencode a given dynamically allocated string
+ *
+ * urlencode_realloc() receives a pointer to a pointer to
+ * a dynamically allocated string to encode plus its size
+ * including the null byte at the end.
+ *
+ * It then replaces every reserved character in the string
+ * except `/` with the appropriate percent encoding. If
+ * the size of the buffer is not enough, it uses `realloc()`
+ * to increase it.
+ *
+ * Note that the implementation of url encoding is not 100%
+ * correct, but should be good enough in the context of
+ * sternenblog. `/` is not encoded since on unix
+ * a slash should always be a path delimiter and never part of
+ * a filename. Another limitation of the url encoding is
+ * that it only checks for a list of characters to encode
+ * instead of checking if the characters are unreserved
+ * and don't need to be encoded which would be more correct.
+ * The approach taken has the big advantage that we don't
+ * need to worry about UTF-8, which makes the implementation
+ * considerably simpler. As a consequence, however, it will
+ * not be aggressive enough in terms of encoding in some
+ * cases.
+ *
+ * On error -1 is returned. In such a case the original
+ * pointer remains intact, so you can either `free()` it
+ * or continue with the unencoded string.
+ *
+ * Otherwise it returns the new size of the buffer.
+ *
+ * @param **input pointer to input string
+ * @param size size of input string including null byte
+ * @return -1 on error, else size of buffer
+ */
+int urlencode_realloc(char **input, int size);
diff --git a/entry.c b/entry.c
index b421042..bf2a808 100644
--- a/entry.c
+++ b/entry.c
@@ -125,8 +125,9 @@ int make_entry(const char *blog_dir, char *script_name, char *path_info, struct
     // don't depend on it starting with a slash
 
     size_t script_name_len = strlen(script_name);
+    size_t link_size = script_name_len + path_info_len + 1;
 
-    entry->link = malloc(sizeof(char) * (script_name_len + path_info_len + 1));
+    entry->link = malloc(sizeof(char) * link_size);
 
     if(script_name_len != 0) {
         memcpy(entry->link, script_name, script_name_len);
@@ -134,7 +135,11 @@ int make_entry(const char *blog_dir, char *script_name, char *path_info, struct
 
     memcpy(entry->link + script_name_len, path_info, path_info_len);
 
-    entry->link[path_info_len + script_name_len] = '\0';
+    entry->link[link_size - 1] = '\0';
+
+    if(urlencode_realloc(&entry->link, link_size) <= 0) {
+        return 500;
+    }
 
     return 200;
 }
author	sternenseemann <sternenseemann@systemli.org>	2020-08-12 14:44:00 +0200
committer	sternenseemann <sternenseemann@systemli.org>	2020-08-12 14:44:00 +0200
commit	a9fd16d86376e4d80a08728a7da44d4c653fa796 (patch)
tree	f8041394ac89d32c212007a190ee220d7331095f
parent	56cee5404fae78b979a00609271b9528df1a8987 (diff)