X-Git-Url: https://git.ucc.asn.au/?a=blobdiff_plain;f=Usermode%2FLibraries%2Flibunicode.so_src%2Futf-8.c;fp=Usermode%2FLibraries%2Flibunicode.so_src%2Futf-8.c;h=3aa9d1a8dbcf9449d2b6c2fbf10dfb9989229346;hb=6a99a6d70179161964d47de9a825fd61e8445b86;hp=0000000000000000000000000000000000000000;hpb=265bcb9e6fd6611eda6bba3aed13da83e584e058;p=tpg%2Facess2.git diff --git a/Usermode/Libraries/libunicode.so_src/utf-8.c b/Usermode/Libraries/libunicode.so_src/utf-8.c new file mode 100644 index 00000000..3aa9d1a8 --- /dev/null +++ b/Usermode/Libraries/libunicode.so_src/utf-8.c @@ -0,0 +1,153 @@ +/* + * Acess2 "libunicode" UTF Parser + * - By John Hodge (thePowersGang) + * + * utf-8.c + * - UTF-8 Parsing code + */ +#include +#include + +/** + * \brief Read a UTF-8 character from a string + * \param Input Source UTF-8 encoded string + * \param Val Destination for read codepoint + * \return Number of bytes read/used + */ +int ReadUTF8(const char *Input, uint32_t *Val) +{ + const uint8_t *str = (const uint8_t *)Input; + *Val = 0xFFFD; // Assume invalid character + + // ASCII + if( !(*str & 0x80) ) { + *Val = *str; + return 1; + } + + // Middle of a sequence + if( (*str & 0xC0) == 0x80 ) { + return 1; + } + + // Two Byte + if( (*str & 0xE0) == 0xC0 ) { + *Val = (*str & 0x1F) << 6; // Upper 6 Bits + str ++; + if( (*str & 0xC0) != 0x80) return -1; // Validity check + *Val |= (*str & 0x3F); // Lower 6 Bits + return 2; + } + + // Three Byte + if( (*str & 0xF0) == 0xE0 ) { + *Val = (*str & 0x0F) << 12; // Upper 4 Bits + str ++; + if( (*str & 0xC0) != 0x80) return -1; // Validity check + *Val |= (*str & 0x3F) << 6; // Middle 6 Bits + str ++; + if( (*str & 0xC0) != 0x80) return -1; // Validity check + *Val |= (*str & 0x3F); // Lower 6 Bits + return 3; + } + + // Four Byte + if( (*str & 0xF8) == 0xF0 ) { + *Val = (*str & 0x07) << 18; // Upper 3 Bits + str ++; + if( (*str & 0xC0) != 0x80) return -1; // Validity check + *Val |= (*str & 0x3F) << 12; // Middle-upper 6 Bits + str ++; + if( (*str & 0xC0) != 0x80) return -1; // Validity check + *Val |= (*str & 0x3F) << 6; // Middle-lower 6 Bits + str ++; + if( (*str & 0xC0) != 0x80) return -1; // Validity check + *Val |= (*str & 0x3F); // Lower 6 Bits + return 4; + } + + // UTF-8 Doesn't support more than four bytes + return 4; +} + +/** + * \brief Get the UTF-8 character before the + * \ + */ +int ReadUTF8Rev(const char *Base, int Offset, uint32_t *Val) +{ + int len = 0; + + // Scan backwards for the beginning of the character + while( Offset > 0 && (Base[Offset--] & 0xC0) == 0x80 ) + len ++; + // Invalid string (no beginning) + if(Offset == 0 && (Base[Offset] & 0xC0) == 0x80 ) + return len; + + len ++; // First character + if( ReadUTF8(Base+Offset, Val) != len ) { + *Val = 0xFFFD; + } + return len; +} + +/** + * \brief Write a UTF-8 character sequence to a string + * \param buf Destination buffer (must have at least 4 bytes available) + * \param Val Unicode codepoint to write + * \return Number of bytes written + * \note Does not NULL terminate the string in \a buf + */ +int WriteUTF8(char *buf, uint32_t Val) +{ + uint8_t *str = (void*)buf; + + // ASCII + if( Val < 128 ) { + if(str) { + *str = Val; + } + return 1; + } + + // Two Byte + if( Val < 0x8000 ) { + if(str) { + *str = 0xC0 | (Val >> 6); + str ++; + *str = 0x80 | (Val & 0x3F); + } + return 2; + } + + // Three Byte + if( Val < 0x10000 ) { + if(str) { + *str = 0xE0 | (Val >> 12); + str ++; + *str = 0x80 | ((Val >> 6) & 0x3F); + str ++; + *str = 0x80 | (Val & 0x3F); + } + return 3; + } + + // Four Byte + if( Val < 0x110000 ) { + if(str) { + *str = 0xF0 | (Val >> 18); + str ++; + *str = 0x80 | ((Val >> 12) & 0x3F); + str ++; + *str = 0x80 | ((Val >> 6) & 0x3F); + str ++; + *str = 0x80 | (Val & 0x3F); + } + return 4; + } + + // UTF-8 Doesn't support more than four bytes + return 0; +} +