From d263e6eec0b44c602a47bd301af90865303b8563 Mon Sep 17 00:00:00 2001 From: Josh Coalson Date: Wed, 24 Aug 2005 07:38:13 +0000 Subject: [PATCH] disallow non-shortest-form encodings in utf-8 checking; when converting utf-8 to ucs-2, replace code points outside of ucs-2 with ? character --- src/plugin_common/tags.c | 56 +++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/src/plugin_common/tags.c b/src/plugin_common/tags.c index 8c3008b6..f580f44f 100644 --- a/src/plugin_common/tags.c +++ b/src/plugin_common/tags.c @@ -33,19 +33,54 @@ static __inline unsigned local__wide_strlen(const FLAC__uint16 *s) return n; } +/* + * also disallows non-shortest-form encodings, c.f. + * http://www.unicode.org/versions/corrigendum1.html + * and a more clear explanation at the end of this section: + * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 + */ static __inline unsigned local__utf8len(const FLAC__byte *utf8) { FLAC__ASSERT(0 != utf8); - if ((utf8[0] & 0x80) == 0) + if ((utf8[0] & 0x80) == 0) { return 1; - else if ((utf8[0] & 0xE0) == 0xC0 && (utf8[1] & 0xC0) == 0x80) + } + else if ((utf8[0] & 0xE0) == 0xC0 && (utf8[1] & 0xC0) == 0x80) { + if ((utf8[0] & 0x01) == 0xC0) /* overlong sequence check */ + return 0; return 2; - else if ((utf8[0] & 0xF0) == 0xE0 && (utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80) + } + else if ((utf8[0] & 0xF0) == 0xE0 && (utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80) { + if (utf8[0] == 0xE0 && (utf8[1] & 0xE0) == 0x80) /* overlong sequence check */ + return 0; + /* illegal surrogates check (U+D800...U+DFFF and U+FFFE...U+FFFF) */ + if (utf8[0] == 0xED && (utf8[1] & 0xE0) == 0xA0) /* D800-DFFF */ + return 0; + if (utf8[0] == 0xEF && utf8[1] == 0xBF && (utf8[2] & 0xFE) == 0xBE) /* FFFE-FFFF */ + return 0; return 3; - else + } + else if ((utf8[0] & 0xF8) == 0xF0 && (utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80 && (utf8[3] & 0xC0) == 0x80) { + if (utf8[0] == 0xF0 && (utf8[1] & 0xF0) == 0x80) /* overlong sequence check */ + return 0; + return 4; + } + else if ((utf8[0] & 0xFC) == 0xF8 && (utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80 && (utf8[3] & 0xC0) == 0x80 && (utf8[4] & 0xC0) == 0x80) { + if (utf8[0] == 0xF8 && (utf8[1] & 0xF8) == 0x80) /* overlong sequence check */ + return 0; + return 5; + } + else if ((utf8[0] & 0xFE) == 0xFC && (utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80 && (utf8[3] & 0xC0) == 0x80 && (utf8[4] & 0xC0) == 0x80 && (utf8[5] & 0xC0) == 0x80) { + if (utf8[0] == 0xFC && (utf8[1] & 0xFC) == 0x80) /* overlong sequence check */ + return 0; + return 6; + } + else { return 0; + } } + static __inline unsigned local__utf8_to_ucs2(const FLAC__byte *utf8, FLAC__uint16 *ucs2) { const unsigned len = local__utf8len(utf8); @@ -58,6 +93,8 @@ static __inline unsigned local__utf8_to_ucs2(const FLAC__byte *utf8, FLAC__uint1 *ucs2 = (*utf8 & 0x3F)<<6 | (*(utf8+1) & 0x3F); else if (len == 3) *ucs2 = (*utf8 & 0x1F)<<12 | (*(utf8+1) & 0x3F)<<6 | (*(utf8+2) & 0x3F); + else + *ucs2 = '?'; return len; } @@ -71,8 +108,8 @@ static FLAC__uint16 *local__convert_utf8_to_ucs2(const char *src, unsigned lengt /* calculate length */ { - const char *s, *end; - for (s=src, end=src+length; sdata.vorbis_comment.comments[i].entry, '=')+1); + return (i < 0? 0 : strchr((const char *)tags->data.vorbis_comment.comments[i].entry, '=')+1); } FLAC__uint16 *FLAC_plugin__tags_get_tag_ucs2(const FLAC__StreamMetadata *tags, const char *name)