MDS: Roll an in-house solution for UTF-16 to UTF-8 conversion based on code from StackExchange.

2025-07-24 23:45:25 +02:00
parent 42ceb2ed33
commit 82cab486b2
2 changed files with 89 additions and 13 deletions
--- a/src/cdrom/CMakeLists.txt
+++ b/src/cdrom/CMakeLists.txt
@@ -18,7 +18,6 @@
 find_package(PkgConfig REQUIRED)

 pkg_check_modules(SNDFILE REQUIRED IMPORTED_TARGET sndfile)
-pkg_check_modules(ICONV REQUIRED IMPORTED_TARGET iconv)

 add_library(cdrom OBJECT
    cdrom.c
@@ -26,7 +25,6 @@ add_library(cdrom OBJECT
    cdrom_image_viso.c
 )
 target_link_libraries(86Box PkgConfig::SNDFILE)
-target_link_libraries(86Box PkgConfig::ICONV)

 if(CDROM_MITSUMI)
    target_compile_definitions(cdrom PRIVATE USE_CDROM_MITSUMI)
@@ -35,5 +33,5 @@ endif()

 if (WIN32)
    # MSYS2
-    target_link_libraries(86Box -static ${ICONV_STATIC_LIBRARIES} ${SNDFILE_STATIC_LIBRARIES})
+    target_link_libraries(86Box -static ${SNDFILE_STATIC_LIBRARIES})
 endif()
--- a/src/cdrom/cdrom_image.c
+++ b/src/cdrom/cdrom_image.c
@@ -29,7 +29,6 @@
 #include <wchar.h>
 #include <sys/stat.h>
 #ifndef _WIN32
-#    include <iconv.h>
 #    include <libgen.h>
 #endif
 #include <86box/86box.h>
@@ -1808,6 +1807,91 @@ image_load_cue(cd_image_t *img, const char *cuefile)
    return success;
 }

+// Converts UTF-16 string into UTF-8 string.
+// If destination string is NULL returns total number of symbols that would've
+// been written (without null terminator). However, when actually writing into
+// destination string, it does include it. So, be sure to allocate extra byte
+// for destination string.
+// Params:
+// u16_str      - source UTF-16 string
+// u16_str_len  - length of source UTF-16 string
+// u8_str       - destination UTF-8 string
+// u8_str_size  - size of destination UTF-8 string in bytes
+// Return value:
+// 0 on success, -1 if encountered invalid surrogate pair, -2 if
+// encountered buffer overflow or length of destination UTF-8 string in bytes
+// (without including the null terminator).
+long int utf16_to_utf8(const uint16_t *u16_str, size_t u16_str_len,
+                       uint8_t *u8_str, size_t u8_str_size)
+{
+    size_t i = 0, j = 0;
+
+    if (!u8_str) {
+        u8_str_size = u16_str_len * 4;
+    }
+
+    while (i < u16_str_len) {
+        uint32_t codepoint = u16_str[i++];
+
+        // check for surrogate pair
+        if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
+            uint16_t high_surr = codepoint;
+            uint16_t low_surr  = u16_str[i++];
+
+            if (low_surr < 0xDC00 || low_surr > 0xDFFF)
+                return -1;
+
+            codepoint = ((high_surr - 0xD800) << 10) +
+                        (low_surr - 0xDC00) + 0x10000;
+        }
+
+        if (codepoint < 0x80) {
+            if (j + 1 > u8_str_size) return -2;
+
+            if (u8_str) u8_str[j] = (char)codepoint;
+
+            j++;
+        } else if (codepoint < 0x800) {
+            if (j + 2 > u8_str_size) return -2;
+
+            if (u8_str) {
+                u8_str[j + 0] = 0xC0 | (codepoint >> 6);
+                u8_str[j + 1] = 0x80 | (codepoint & 0x3F);
+            }
+
+            j += 2;
+        } else if (codepoint < 0x10000) {
+            if (j + 3 > u8_str_size) return -2;
+
+            if (u8_str) {
+                u8_str[j + 0] = 0xE0 | (codepoint >> 12);
+                u8_str[j + 1] = 0x80 | ((codepoint >> 6) & 0x3F);
+                u8_str[j + 2] = 0x80 | (codepoint & 0x3F);
+            }
+
+            j += 3;
+        } else {
+            if (j + 4 > u8_str_size) return -2;
+
+            if (u8_str) {
+                u8_str[j + 0] = 0xF0 | (codepoint >> 18);
+                u8_str[j + 1] = 0x80 | ((codepoint >> 12) & 0x3F);
+                u8_str[j + 2] = 0x80 | ((codepoint >> 6) & 0x3F);
+                u8_str[j + 3] = 0x80 | (codepoint & 0x3F);
+            }
+
+            j += 4;
+        }
+    }
+
+    if (u8_str) {
+        if (j >= u8_str_size) return -2;
+        u8_str[j] = '\0';
+    }
+
+    return (long int)j;
+}
+
 static int
 image_load_mds(cd_image_t *img, const char *mdsfile)
 {
@@ -1989,20 +2073,14 @@ image_load_mds(cd_image_t *img, const char *mdsfile)
                char     fn[2048] = { 0 };
                fseek(fp, mds_footer.fn_offs, SEEK_SET);
                if (mds_footer.fn_is_wide) {
+                    int len = 0;
                    for (int i = 0; i < 256; i++) {
                        fread(&wfn[i], 1, 2, fp);
+                        len++;
                        if (wfn[i] == 0x0000)
                            break;
                    }
-#ifdef _WIN32
-                    wcstombs(fn, wfn, 256);
-#else
-                    int src_len = 256;
-                    int dst_len = 512;
-                    iconv_t conv = iconv_open("UTF-8", "UTF-16");
-                    iconv(conv, (char **) &wfn, &src_len, &fn, &dst_len);
-                    iconv_close(conv);
-#endif
+                    (void) utf16_to_utf8(wfn, 2048, (uint8_t *) fn, 2048);
                } else  for (int i = 0; i < 512; i++) {
                    fread(&fn[i], 1, 1, fp);
                    if (fn[i] == 0x00)