view test/testiconv.c @ 3539:f2846bf19360

Fixed bug #896

John Popplewell 2009-12-08 23:05:50 PST

Originally reported by AKFoerster on the mailing list.

Error decoding UTF8 Russian text to UTF-16LE on Windows, but specifically on platforms without iconv support (the default on Windows).

Valid UTF8 characters are flagged as being overlong and then substituted by the UNKNOWN_UNICODE character.

After studying the testiconv.c example program, reading the RFCs and putting some printf statements in SDL_iconv.c, the problem is in a test for 'Maximum overlong sequences', specifically 4.2.1, which is carried out by the following code:

    } else if ( p[0] >= 0xC0 ) {
        if ( (p[0] & 0xE0) != 0xC0 ) {
            /* Skip illegal sequences
               return SDL_ICONV_EILSEQ;
             */
            ch = UNKNOWN_UNICODE;
        } else {
            if ( (p[0] & 0xCE) == 0xC0 ) {    <<<<<<<< here
                overlong = SDL_TRUE;
            }
            ch = (Uint32)(p[0] & 0x1F);
            left = 1;
        }
    } else {

Here is the 2-byte encoding of a character in range 00000080 - 000007FF:

    110xxxxx 10xxxxxx

The line in question is supposed to be checking for an overlong sequence which would be less than

    11000001 10111111

which should be represented as a single byte.

BUT, the mask value (0xCE) is wrong, it isn't checking the top-most bit:

    11000001 value
    11001110 mask (incorrect)
       ^

and should be (0xDE):

    11000001 value
    11011110 mask (correct)

making the above code:

    } else if ( p[0] >= 0xC0 ) {
        if ( (p[0] & 0xE0) != 0xC0 ) {
            /* Skip illegal sequences
               return SDL_ICONV_EILSEQ;
             */
            ch = UNKNOWN_UNICODE;
        } else {
            if ( (p[0] & 0xDE) == 0xC0 ) {    <<<<<<<< here
                overlong = SDL_TRUE;
            }
            ch = (Uint32)(p[0] & 0x1F);
            left = 1;
        }
    } else {

I can supply a test program and/or a patch if required,

best regards,
John Popplewell
author Sam Lantinga <slouken@libsdl.org>
date Fri, 11 Dec 2009 08:03:43 +0000
parents e906da4414a3
children
line wrap: on
line source


#include <stdio.h>

#include "SDL.h"

/*
 * Count the 32-bit units stored in `data` before the first zero unit.
 *
 * The buffer is read as an array of Uint32 values; the zero unit that ends
 * the string is not included in the returned count.
 */
static size_t
widelen(char *data)
{
    const Uint32 *units = (const Uint32 *) data;
    size_t count;

    for (count = 0; units[count] != 0; ++count) {
        /* the loop header does all the work */
    }
    return count;
}

/*
 * Round-trip test for SDL_iconv_string().
 *
 * Reads the file named by argv[1] (or "utf8.txt" when no argument is given)
 * line by line as UTF-8, converts each line to UCS-4, then for every name in
 * `formats` converts UCS-4 -> format -> UCS-4 and compares the result with
 * the original UCS-4 data.  Each mismatch or failed conversion is reported
 * on stderr as "FAIL: <name>".  Finally the line is converted back to UTF-8
 * and echoed to stdout.
 *
 * Returns 1 if the input file cannot be opened, 0 if no failure was counted,
 * and (number of failures + 1) otherwise.
 */
int
main(int argc, char *argv[])
{
    const char *formats[] = {
        "UTF8",
        "UTF-8",
        "UTF16BE",
        "UTF-16BE",
        "UTF16LE",
        "UTF-16LE",
        "UTF32BE",
        "UTF-32BE",
        "UTF32LE",
        "UTF-32LE",
        "UCS4",
        "UCS-4",
    };
    char buffer[BUFSIZ];
    char *ucs4;
    char *test[2];
    const char *fname;
    size_t i;
    FILE *file;
    int errors = 0;

    /* Pick the default file name locally instead of writing into argv. */
    fname = (argc > 1 && argv[1]) ? argv[1] : "utf8.txt";
    file = fopen(fname, "rb");
    if (!file) {
        fprintf(stderr, "Unable to open %s\n", fname);
        return (1);
    }

    while (fgets(buffer, sizeof(buffer), file)) {
        size_t len;

        /* Convert to UCS-4 (the +1 carries the terminating NUL along) */
        ucs4 =
            SDL_iconv_string("UCS-4", "UTF-8", buffer,
                             SDL_strlen(buffer) + 1);
        if (!ucs4) {
            /* Nothing to measure or compare against: count it and move on
               rather than handing a NULL pointer to widelen(). */
            fprintf(stderr, "FAIL: %s\n", "UTF-8 -> UCS-4");
            ++errors;
            continue;
        }

        /* Byte length of the UCS-4 string including its terminator */
        len = (widelen(ucs4) + 1) * 4;

        for (i = 0; i < SDL_arraysize(formats); ++i) {
            test[0] = SDL_iconv_string(formats[i], "UCS-4", ucs4, len);
            /* NOTE(review): `len` is the UCS-4 byte count, which is not
               necessarily the byte length of test[0] in formats[i];
               kept as in the original test -- confirm this is intended. */
            test[1] = test[0] ?
                SDL_iconv_string("UCS-4", formats[i], test[0], len) : NULL;
            if (!test[1] || SDL_memcmp(test[1], ucs4, len) != 0) {
                fprintf(stderr, "FAIL: %s\n", formats[i]);
                ++errors;
            }
            if (test[0]) {
                SDL_free(test[0]);
            }
            if (test[1]) {
                SDL_free(test[1]);
            }
        }

        /* Echo the line back as UTF-8 */
        test[0] = SDL_iconv_string("UTF-8", "UCS-4", ucs4, len);
        SDL_free(ucs4);
        if (test[0]) {
            fputs(test[0], stdout);
            SDL_free(test[0]);
        } else {
            /* fputs(NULL, ...) is undefined; report the failure instead */
            fprintf(stderr, "FAIL: %s\n", "UCS-4 -> UTF-8");
            ++errors;
        }
    }

    fclose(file);
    return (errors ? errors + 1 : 0);
}