From 5d771762e3cce5aaebb3e8bc2ab7f582ed6032ef Mon Sep 17 00:00:00 2001 From: "L. Pereira" Date: Mon, 13 May 2024 20:25:26 -0700 Subject: [PATCH] Use a branchless binary search for extension<->MIME-Types This uses the excellent branchless binary search by Orson Peters[1], based on Malte Skarupke's version[2] of Leonard E Shar's version of a binary search. It's a fascinating implementation that ends up becoming just a very tight loop using CMOV instructions, and finishing with another CMOV instruction. It's so clean that I removed the fast path using STRING_SWITCH as that's not necessary anymore. A nice side effect of this change is that the string "application/octet-stream" doesn't appear in the binary anymore (except in the debug version due to assertions), as it's now part of the compressed blob that mimegen generates. A good segue to this commit would be porting the other usage of bsearch() to this implementation. [1] https://orlp.net/blog/bitwise-binary-search/ [2] https://probablydance.com/2023/04/27/beautiful-branchless-binary-search/ --- src/bin/tools/mimegen.c | 59 ++++++++++++++++++++++------------------- src/lib/lwan-tables.c | 47 ++++++++++++-------------------- 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/src/bin/tools/mimegen.c b/src/bin/tools/mimegen.c index 354ae85de..68291d049 100644 --- a/src/bin/tools/mimegen.c +++ b/src/bin/tools/mimegen.c @@ -161,29 +161,6 @@ static char *compress_output(const struct output *output, size_t *outlen) return compressed; } -static bool is_builtin_ext(const char *ext) -{ - /* STRING_SWITCH_L() is not used here to not bring in lwan.h */ - /* FIXME: maybe use an X-macro to keep in sync with lwan-tables.c? 
*/ - if (strcaseequal_neutral(ext, "css")) - return true; - if (strcaseequal_neutral(ext, "gif")) - return true; - if (strcaseequal_neutral(ext, "htm")) - return true; - if (strcaseequal_neutral(ext, "html")) - return true; - if (strcaseequal_neutral(ext, "jpg")) - return true; - if (strcaseequal_neutral(ext, "js")) - return true; - if (strcaseequal_neutral(ext, "png")) - return true; - if (strcaseequal_neutral(ext, "txt")) - return true; - return false; -} - int main(int argc, char *argv[]) { /* 32k is sufficient for the provided mime.types, but we can reallocate @@ -258,11 +235,6 @@ int main(int argc, char *argv[]) ext[8] = '\0'; } - /* Lwan has a fast-path for some common extensions, so don't bundle them - * in this table if not really needed. */ - if (is_builtin_ext(ext)) - continue; - k = strdup(ext); v = strdup(mime_type); @@ -286,6 +258,22 @@ int main(int argc, char *argv[]) } } + { + char *k = strdup("bin"); + char *v = strdup("application/octet-stream"); + if (!k || !v) { + fprintf(stderr, "Could not allocate memory\n"); + fclose(fp); + return 1; + } + int r = hash_add_unique(ext_mime, k, v); + if (r != 0 && r != -EEXIST) { + fprintf(stderr, "Could not add fallback mime entry\n"); + fclose(fp); + return 1; + } + } + /* Get sorted list of extensions. */ exts = calloc(hash_get_count(ext_mime), sizeof(char *)); if (!exts) { @@ -305,6 +293,7 @@ int main(int argc, char *argv[]) fclose(fp); return 1; } + ssize_t bin_index = -1; for (i = 0; i < hash_get_count(ext_mime); i++) { uint64_t ext_lower = 0; @@ -322,6 +311,9 @@ int main(int argc, char *argv[]) fclose(fp); return 1; } + + if (bin_index < 0 && streq(exts[i], "bin")) + bin_index = (ssize_t)i; } for (i = 0; i < hash_get_count(ext_mime); i++) { if (output_append(&output, hash_find(ext_mime, exts[i])) < 0) { @@ -331,6 +323,12 @@ int main(int argc, char *argv[]) } } + if (bin_index < 0) { + fprintf(stderr, "Could not find fallback item after sorting!\n"); + fclose(fp); + return 1; + } + /* Compress blob. 
*/ compressed = compress_output(&output, &compressed_size); if (!compressed) { @@ -349,10 +347,15 @@ int main(int argc, char *argv[]) #else printf("/* Compressed with zlib (deflate) */\n"); #endif + + unsigned int entries_floor = 1u << (31 - __builtin_clz(hash_get_count(ext_mime))); + printf("#pragma once\n"); printf("#define MIME_UNCOMPRESSED_LEN %zu\n", output.used); printf("#define MIME_COMPRESSED_LEN %lu\n", compressed_size); printf("#define MIME_ENTRIES %d\n", hash_get_count(ext_mime)); + printf("#define MIME_ENTRIES_FLOOR %d\n", entries_floor); + printf("#define MIME_ENTRY_FALLBACK %ld\n", bin_index); printf("static const unsigned char mime_entries_compressed[] = {\n"); for (i = 1; compressed_size; compressed_size--, i++) printf("0x%02x,%c", compressed[i - 1] & 0xff, " \n"[i % 13 == 0]); diff --git a/src/lib/lwan-tables.c b/src/lib/lwan-tables.c index 6a63508fd..bcbc2d159 100644 --- a/src/lib/lwan-tables.c +++ b/src/lib/lwan-tables.c @@ -36,6 +36,7 @@ static unsigned char uncompressed_mime_entries[MIME_UNCOMPRESSED_LEN]; static char *mime_types[MIME_ENTRIES]; +static uint64_t *mime_extensions; static bool mime_entries_initialized = false; void lwan_tables_shutdown(void) @@ -86,6 +87,7 @@ void lwan_tables_init(void) mime_types[i] = (char *)ptr; ptr += strlen((const char *)ptr) + 1; } + mime_extensions = (uint64_t *)uncompressed_mime_entries; mime_entries_initialized = true; @@ -120,34 +122,25 @@ LWAN_SELF_TEST(status_codes) #undef ASSERT_STATUS } -static int compare_mime_entry(const void *a, const void *b) +static ALWAYS_INLINE const char *bsearch_mime_type(uint64_t ext) { - const uint64_t exta = string_as_uint64((const char *)a); - const uint64_t extb = string_as_uint64((const char *)b); - - return (exta > extb) - (exta < extb); + /* Based on https://orlp.net/blog/bitwise-binary-search/ */ + int64_t b = ext > mime_extensions[MIME_ENTRIES / 2] + ? 
MIME_ENTRIES - MIME_ENTRIES_FLOOR + : -1; + for (uint64_t bit = MIME_ENTRIES_FLOOR >> 1; bit != 0; bit >>= 1) { + if (ext > mime_extensions[b + (int64_t)bit]) + b += (int64_t)bit; + } + return mime_types[mime_extensions[b + 1] == ext ? b + 1 + : MIME_ENTRY_FALLBACK]; } -const char * -lwan_determine_mime_type_for_file_name(const char *file_name) +const char *lwan_determine_mime_type_for_file_name(const char *file_name) { char *last_dot = strrchr(file_name, '.'); - if (UNLIKELY(!last_dot)) - goto fallback; - - STRING_SWITCH_L(last_dot) { - case STR4_INT_L('.','c','s','s'): return "text/css"; - case STR4_INT_L('.','g','i','f'): return "image/gif"; - case STR4_INT_L('.','h','t','m'): return "text/html"; - case STR4_INT_L('.','j','p','g'): return "image/jpeg"; - case STR4_INT_L('.','j','s',' '): return "text/javascript"; - case STR4_INT_L('.','p','n','g'): return "image/png"; - case STR4_INT_L('.','t','x','t'): return "text/plain"; - } - - if (LIKELY(*last_dot)) { + if (LIKELY(last_dot && *last_dot)) { uint64_t key = 0; - const unsigned char *extension; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" @@ -157,17 +150,11 @@ lwan_determine_mime_type_for_file_name(const char *file_name) * 8 bytes per extension. */ strncpy((char *)&key, last_dot + 1, 8); #pragma GCC diagnostic pop - key &= ~0x2020202020202020ull; - key = htobe64(key); - extension = bsearch(&key, uncompressed_mime_entries, MIME_ENTRIES, 8, - compare_mime_entry); - if (LIKELY(extension)) - return mime_types[(extension - uncompressed_mime_entries) / 8]; + return bsearch_mime_type(htobe64(key & ~0x2020202020202020ull)); } -fallback: - return "application/octet-stream"; + return mime_types[MIME_ENTRY_FALLBACK]; } #include "lookup-http-status.h" /* genrated by statuslookupgen */