2 * Acess2 "libunicode" UTF Parser
3 * - By John Hodge (thePowersGang)
/**
 * \brief Read a UTF-8 character from a string
 * \param Input	Source UTF-8 encoded string (NUL terminated, must not be NULL)
 * \param Val	Destination for read codepoint (may be NULL to only measure)
 * \return Number of bytes read/used (1 to 4), or -1 if the sequence is invalid
 * \note On failure \a Val (if provided) is left as U+FFFD, the replacement
 *       character.
 * \note Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values
 *       above U+10FFFF are rejected, as required for well-formed UTF-8.
 * \note A terminating NUL is decoded as codepoint 0 with a length of 1.
 */
int ReadUTF8(const char *Input, uint32_t *Val)
{
	const uint8_t	*str = (const uint8_t *)Input;
	uint32_t	val;	// Codepoint being assembled
	uint32_t	min;	// Smallest codepoint that legitimately needs `len` bytes
	int	len;	// Total length of the sequence, taken from the lead byte
	int	i;
	
	if(Val)	*Val = 0xFFFD;	// Assume invalid character
	
	// Middle of a sequence
	// NOTE(review): the original return value for this case is not visible
	// in the reviewed excerpt; -1 matches the other validity failures.
	if( (*str & 0xC0) == 0x80 ) {
		return -1;
	}
	
	// ASCII
	if( !(*str & 0x80) ) {
		val = *str;
		min = 0;
		len = 1;
	}
	// Two Byte
	else if( (*str & 0xE0) == 0xC0 ) {
		val = (*str & 0x1F);	// Upper 5 Bits
		min = 0x80;
		len = 2;
	}
	// Three Byte
	else if( (*str & 0xF0) == 0xE0 ) {
		val = (*str & 0x0F);	// Upper 4 Bits
		min = 0x800;
		len = 3;
	}
	// Four Byte
	else if( (*str & 0xF8) == 0xF0 ) {
		val = (*str & 0x07);	// Upper 3 Bits
		min = 0x10000;
		len = 4;
	}
	// UTF-8 Doesn't support more than four bytes
	// NOTE(review): original return value not visible in the excerpt.
	else {
		return -1;
	}
	
	// Each trailing byte must be 10xxxxxx and contributes six bits.
	// A NUL terminator fails this check, so a truncated sequence never
	// causes a read beyond the end of the string.
	for( i = 1; i < len; i ++ )
	{
		str ++;
		if( (*str & 0xC0) != 0x80 )	return -1;	// Validity check
		val = (val << 6) | (*str & 0x3F);
	}
	
	// Overlong encoding (value would have fitted in a shorter sequence)
	if( val < min )	return -1;
	// UTF-16 surrogate halves are not valid scalar values
	if( val >= 0xD800 && val <= 0xDFFF )	return -1;
	// Beyond the end of the Unicode code space
	if( val > 0x10FFFF )	return -1;
	
	if(Val)	*Val = val;
	return len;
}
/**
 * \brief Get the UTF-8 character before the given offset in a string
 * \param Base	Start of the UTF-8 encoded string
 * \param Offset	Byte offset (from \a Base) just past the character to read
 * \param Val	Destination for read codepoint (may be NULL)
 * \return Number of bytes occupied by the character, i.e. how far to step
 *         \a Offset back; 0 if there is nothing before \a Offset
 * \note If the bytes before \a Offset do not form exactly one valid
 *       character, \a Val is set to U+FFFD and the number of bytes scanned
 *       is still returned so a caller walking backwards makes progress.
 */
int ReadUTF8Rev(const char *Base, int Offset, uint32_t *Val)
{
	const uint8_t	*bytes = (const uint8_t *)Base;
	int	len = 0;
	
	if(Val)	*Val = 0xFFFD;	// Assume invalid character
	
	// Nothing precedes the start of the string
	if( Offset <= 0 )
		return 0;
	
	// Scan backwards for the beginning of the character
	// - Looks at the byte *before* Offset, and only moves when that byte
	//   really is a continuation byte (10xxxxxx)
	while( Offset > 0 && (bytes[Offset-1] & 0xC0) == 0x80 )
	{
		Offset --;
		len ++;
	}
	
	// Invalid string (no beginning): only continuation bytes were found
	if( Offset == 0 )
		return len;
	
	// Step onto the lead (or ASCII) byte
	Offset --;
	len ++;	// First character
	
	// The forward decoder must consume exactly the bytes scanned over,
	// otherwise the sequence is malformed
	if( ReadUTF8(Base+Offset, Val) != len ) {
		if(Val)	*Val = 0xFFFD;
	}
	return len;
}
/**
 * \brief Write a UTF-8 character sequence to a string
 * \param buf	Destination buffer (must have at least 4 bytes available), or
 *           	NULL to only compute the encoded length
 * \param Val	Unicode codepoint to write
 * \return Number of bytes written (1 to 4), or 0 if \a Val cannot be encoded
 * \note Does not NULL terminate the string in \a buf
 * \note UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF are
 *       not valid Unicode scalar values; nothing is written for them.
 */
int WriteUTF8(char *buf, uint32_t Val)
{
	uint8_t	*str = (void*)buf;
	int	len;
	
	// Surrogate halves must never appear in UTF-8
	if( Val >= 0xD800 && Val <= 0xDFFF )
		return 0;
	
	// Pick the shortest form that can hold the value
	if( Val < 0x80 )
		len = 1;	// ASCII
	else if( Val < 0x800 )
		len = 2;	// 11 bits
	else if( Val < 0x10000 )
		len = 3;	// 16 bits
	else if( Val < 0x110000 )
		len = 4;	// 21 bits, capped at U+10FFFF
	else {
		// UTF-8 Doesn't support more than four bytes
		// NOTE(review): the original return value for this case is not
		// visible in the reviewed excerpt; 0 ("nothing written") is used.
		return 0;
	}
	
	// Length query only
	if( !str )
		return len;
	
	switch( len )
	{
	case 1:
		str[0] = Val;
		break;
	case 2:
		str[0] = 0xC0 | (Val >> 6);
		str[1] = 0x80 | (Val & 0x3F);
		break;
	case 3:
		str[0] = 0xE0 | (Val >> 12);
		str[1] = 0x80 | ((Val >> 6) & 0x3F);
		str[2] = 0x80 | (Val & 0x3F);
		break;
	default:	// 4
		str[0] = 0xF0 | (Val >> 18);
		str[1] = 0x80 | ((Val >> 12) & 0x3F);
		str[2] = 0x80 | ((Val >> 6) & 0x3F);
		str[3] = 0x80 | (Val & 0x3F);
		break;
	}
	
	return len;
}