Initial unicode implementation for UTF8 encoded text

author: Demizdor <avalorrian@gmail.com> 2019-04-21 12:27:46 +0300
committer: Demizdor <avalorrian@gmail.com> 2019-04-21 12:27:46 +0300
commit: f3a5a6871d4ec005026817e5f2579a6f55938dc2 (patch)
tree: 824844999de400efeca863d7b0051018dc6f1536
parent: 8c22f685d168000eabfc994a09c3a2a61f7f633f (diff)
download: raylib-f3a5a6871d4ec005026817e5f2579a6f55938dc2.tar.gz
raylib-f3a5a6871d4ec005026817e5f2579a6f55938dc2.zip
2 files changed, 145 insertions, 62 deletions
diff --git a/src/raylib.h b/src/raylib.h
index 0afa69d0..6a5f0ef8 100644
--- a/src/raylib.h
+++ b/src/raylib.h
@@ -1190,11 +1190,13 @@ RLAPI void DrawTextRecEx(Font font, const char *text, Rectangle rec, float fontS
 RLAPI int MeasureText(const char *text, int fontSize);                                      // Measure string width for default font
 RLAPI Vector2 MeasureTextEx(Font font, const char *text, float fontSize, float spacing);    // Measure string size for Font
 RLAPI int GetGlyphIndex(Font font, int character);                                          // Get index position for a unicode character on font
+RLAPI int GetNextCodepoint(const char* text, int* count);                                   // Returns next codepoint in a UTF8 encoded `text` or 0x3f(`?`) on failure. `count` will hold the total number of bytes processed.
 
 // Text strings management functions
 // NOTE: Some strings allocate memory internally for returned strings, just be careful!
 RLAPI bool TextIsEqual(const char *text1, const char *text2);                               // Check if two text string are equal
 RLAPI unsigned int TextLength(const char *text);                                            // Get text length, checks for '\0' ending
+RLAPI unsigned int TextCountCodepoints(const char *text);                                   // Get total number of characters(codepoints) in a UTF8 encoded `text` until '\0' is found. 
 RLAPI const char *TextFormat(const char *text, ...);                                        // Text formatting with variables (sprintf style)
 RLAPI const char *TextSubtext(const char *text, int position, int length);                  // Get a piece of a text string
 RLAPI const char *TextReplace(char *text, const char *replace, const char *by);             // Replace text string (memory should be freed!)
diff --git a/src/text.c b/src/text.c
index c07f807a..bd9e09f0 100644
--- a/src/text.c
+++ b/src/text.c
@@ -719,6 +719,97 @@ void DrawFPS(int posX, int posY)
     DrawText(TextFormat("%2i FPS", fps), posX, posY, 20, LIME);
 }
 
+// Decode the first codepoint of the UTF8 encoded, '\0' terminated `text`. Decoding stops at the first invalid byte, in which case
+// `?`(0x3f) is returned. `count` receives the number of bytes processed (1 to 4); on failure it may include the offending byte,
+// which can be the terminating '\0', so after a `?` callers should advance by one byte instead of `count`.
+// NOTE: the standard says U+FFFD should be returned in case of errors but that character is not supported by the default font in raylib
+// TODO: optimize this code for speed!!
+int GetNextCodepoint(const char* text, int* count)
+{
+/*
+   UTF8 specs from https://www.ietf.org/rfc/rfc3629.txt
+
+   Char. number range  |        UTF-8 octet sequence
+      (hexadecimal)    |              (binary)
+   --------------------+---------------------------------------------
+   0000 0000-0000 007F | 0xxxxxxx
+   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+*/
+    // NOTE: on decode errors we return as soon as possible; a lead byte matching none of the patterns below
+    // (stray tail x80-BF or xF8-FF) falls through every branch and yields `?` with `count` = 1
+    int c = 0x3f;   // Codepoint (defaults to `?`)
+    int o = (unsigned char)(text[0]); // The first UTF8 octet
+    *count = 1;
+
+    if (o <= 0x7f)
+    {
+        // Only one octet (ASCII range x00-7F)
+        c = o;
+    }
+    else if ((o & 0xe0) == 0xc0)
+    {
+        // Two octets
+        // [0]xC2-DF    [1]UTF8-tail(x80-BF)
+        unsigned char o1 = text[1];
+        if ((o1 == '\0') || ((o1 >> 6) != 2)) { *count = 2; return c; } // Unexpected sequence
+        if ((o >= 0xc2) && (o <= 0xdf))     // xC0 and xC1 would be overlong encodings, they keep the `?` default
+        {
+            c = ((o & 0x1f) << 6) | (o1 & 0x3f);
+            *count = 2;
+        }
+    }
+    else if ((o & 0xf0) == 0xe0)
+    {
+        // Three octets
+        unsigned char o1 = text[1], o2 = '\0';
+        if ((o1 == '\0') || ((o1 >> 6) != 2)) { *count = 2; return c; } // Unexpected sequence
+        o2 = text[2];   // Only read once o1 is known not to be the terminator
+        if ((o2 == '\0') || ((o2 >> 6) != 2)) { *count = 3; return c; } // Unexpected sequence
+
+        /* [0]xE0    [1]xA0-BF       [2]UTF8-tail(x80-BF)
+           [0]xE1-EC [1]UTF8-tail    [2]UTF8-tail(x80-BF)
+           [0]xED    [1]x80-9F       [2]UTF8-tail(x80-BF)
+           [0]xEE-EF [1]UTF8-tail    [2]UTF8-tail(x80-BF)
+        */
+        // xE0 with a tail below xA0 is overlong, xED with a tail above x9F is a UTF-16 surrogate (U+D800-DFFF)
+        if (((o == 0xe0) && !((o1 >= 0xa0) && (o1 <= 0xbf))) || ((o == 0xed) && !((o1 >= 0x80) && (o1 <= 0x9f)))) { *count = 2; return c; }
+        if ((o >= 0xe0) && (o <= 0xef))     // Holds for every lead byte that reaches this branch
+        {
+            c = ((o & 0xf) << 12) | ((o1 & 0x3f) << 6) | (o2 & 0x3f);
+            *count = 3;
+        }
+    }
+    else if ((o & 0xf8) == 0xf0)
+    {
+        // Four octets
+        if (o > 0xf4) return c;     // xF5-F7 could only encode values above U+10FFFF
+
+        unsigned char o1 = text[1], o2 = '\0', o3 = '\0';
+        if ((o1 == '\0') || ((o1 >> 6) != 2)) { *count = 2; return c; }  // Unexpected sequence
+        o2 = text[2];
+        if ((o2 == '\0') || ((o2 >> 6) != 2)) { *count = 3; return c; }  // Unexpected sequence
+        o3 = text[3];
+        if ((o3 == '\0') || ((o3 >> 6) != 2)) { *count = 4; return c; }  // Unexpected sequence
+
+        /* [0]xF0       [1]x90-BF       [2]UTF8-tail  [3]UTF8-tail
+           [0]xF1-F3    [1]UTF8-tail    [2]UTF8-tail  [3]UTF8-tail
+           [0]xF4       [1]x80-8F       [2]UTF8-tail  [3]UTF8-tail
+        */
+        if (((o == 0xf0) && !((o1 >= 0x90) && (o1 <= 0xbf))) || ((o == 0xf4) && !((o1 >= 0x80) && (o1 <= 0x8f)))) { *count = 2; return c; } // Unexpected sequence
+        if (o >= 0xf0)
+        {
+            c = ((o & 0x7) << 18) | ((o1 & 0x3f) << 12) | ((o2 & 0x3f) << 6) | (o3 & 0x3f);
+            *count = 4;
+        }
+    }
+
+    if (c > 0x10ffff) c = 0x3f; // Codepoints after U+10ffff are invalid
+    return c;
+}
+
+
 // Draw text (using default font)
 // NOTE: fontSize work like in any drawing program but if fontSize is lower than font-base-size, then font-base-size is used
 // NOTE: chars spacing is proportional to fontSize
@@ -746,17 +837,22 @@ void DrawTextEx(Font font, const char *text, Vector2 position, float fontSize, f
     int textOffsetY = 0;        // Required for line break!
     float scaleFactor = 0.0f;
 
-    unsigned char letter = 0;   // Current character
+    int letter = 0;             // Current character
     int index = 0;              // Index position in sprite font
 
     scaleFactor = fontSize/font.baseSize;
 
-    // NOTE: Some ugly hacks are made to support Latin-1 Extended characters directly
-    // written in C code files (codified by default as UTF-8)
-
     for (int i = 0; i < length; i++)
     {
-        if ((unsigned char)text[i] == '\n')
+        int next = 1;
+        letter = GetNextCodepoint(&text[i], &next);
+        // NOTE: normally we exit the decoding sequence as soon as a bad byte is found (and return 0x3f)
+        // but we need to draw all of the bad bytes using the '?' symbol so to not skip any we set `next = 1`
+        if(letter == 0x3f) next = 1; 
+        index = GetGlyphIndex(font, letter);
+        i += next - 1;
+        
+        if (letter == '\n')
         {
             // NOTE: Fixed line spacing of 1.5 lines
             textOffsetY += (int)((font.baseSize + font.baseSize/2)*scaleFactor);
@@ -764,23 +860,7 @@ void DrawTextEx(Font font, const char *text, Vector2 position, float fontSize, f
         }
         else
         {
-            if ((unsigned char)text[i] == 0xc2)         // UTF-8 encoding identification HACK!
-            {
-                // Support UTF-8 encoded values from [0xc2 0x80] -> [0xc2 0xbf](¿)
-                letter = (unsigned char)text[i + 1];
-                index = GetGlyphIndex(font, (int)letter);
-                i++;
-            }
-            else if ((unsigned char)text[i] == 0xc3)    // UTF-8 encoding identification HACK!
-            {
-                // Support UTF-8 encoded values from [0xc3 0x80](À) -> [0xc3 0xbf](ÿ)
-                letter = (unsigned char)text[i + 1];
-                index = GetGlyphIndex(font, (int)letter + 64);
-                i++;
-            }
-            else index = GetGlyphIndex(font, (unsigned char)text[i]);
-
-            if ((unsigned char)text[i] != ' ')
+            if (letter != ' ')
             {
                 DrawTexturePro(font.texture, font.chars[index].rec,
                            (Rectangle){ position.x + textOffsetX + font.chars[index].offsetX*scaleFactor,
@@ -810,7 +890,7 @@ void DrawTextRecEx(Font font, const char *text, Rectangle rec, float fontSize, f
     int textOffsetY = 0;        // Required for line break!
     float scaleFactor = 0.0f;
 
-    unsigned char letter = 0;   // Current character
+    int letter = 0;             // Current character
     int index = 0;              // Index position in sprite font
 
     scaleFactor = fontSize/font.baseSize;
@@ -823,26 +903,16 @@ void DrawTextRecEx(Font font, const char *text, Rectangle rec, float fontSize, f
     for (int i = 0; i < length; i++)
     {
         int glyphWidth = 0;
-        letter = (unsigned char)text[i];
+        int next = 1;
+        letter = GetNextCodepoint(&text[i], &next);
+        // NOTE: normally we exit the decoding sequence as soon as a bad byte is found (and return 0x3f)
+        // but we need to draw all of the bad bytes using the '?' symbol so to not skip any we set `next = 1`
+        if(letter == 0x3f) next = 1; 
+        index = GetGlyphIndex(font, letter);
+        i += next - 1;
 
         if (letter != '\n')
-        {
-            if ((unsigned char)text[i] == 0xc2)         // UTF-8 encoding identification HACK!
-            {
-                // Support UTF-8 encoded values from [0xc2 0x80] -> [0xc2 0xbf](¿)
-                letter = (unsigned char)text[i + 1];
-                index = GetGlyphIndex(font, (int)letter);
-                i++;
-            }
-            else if ((unsigned char)text[i] == 0xc3)    // UTF-8 encoding identification HACK!
-            {
-                // Support UTF-8 encoded values from [0xc3 0x80](À) -> [0xc3 0xbf](ÿ)
-                letter = (unsigned char)text[i + 1];
-                index = GetGlyphIndex(font, (int)letter + 64);
-                i++;
-            }
-            else index = GetGlyphIndex(font, (unsigned char)text[i]);
-
+        {   
             glyphWidth = (font.chars[index].advanceX == 0)?
                          (int)(font.chars[index].rec.width*scaleFactor + spacing):
                          (int)(font.chars[index].advanceX*scaleFactor + spacing);
@@ -858,13 +928,15 @@ void DrawTextRecEx(Font font, const char *text, Rectangle rec, float fontSize, f
         // the container.
         if (state == MEASURE_STATE)
         {
+            // TODO: there are multiple types of `spaces` in UNICODE, maybe it's a good idea to add support for more
+            // see: http://jkorpela.fi/chars/spaces.html 
             if ((letter == ' ') || (letter == '\t') || (letter == '\n')) endLine = i;
 
             if ((textOffsetX + glyphWidth + 1) >= rec.width)
             {
                 endLine = (endLine < 1)? i : endLine;
-                if (i == endLine) endLine -= 1;
-                if ((startLine + 1) == endLine) endLine = i - 1;
+                if (i == endLine) endLine -= next;
+                if ((startLine + next) == endLine) endLine = i - next;
                 state = !state;
             }
             else if ((i + 1) == length)
@@ -972,31 +1044,23 @@ Vector2 MeasureTextEx(Font font, const char *text, float fontSize, float spacing
     float textHeight = (float)font.baseSize;
     float scaleFactor = fontSize/(float)font.baseSize;
 
-    unsigned char letter = 0;       // Current character
+    int letter = 0;                 // Current character
     int index = 0;                  // Index position in sprite font
 
     for (int i = 0; i < len; i++)
     {
         lenCounter++;
-
-        if (text[i] != '\n')
+        
+        int next = 1;
+        letter = GetNextCodepoint(&text[i], &next);
+        // NOTE: normally we exit the decoding sequence as soon as a bad byte is found (and return 0x3f)
+        // but we need to draw all of the bad bytes using the '?' symbol so to not skip any we set `next = 1`
+        if(letter == 0x3f) next = 1; 
+        i += next - 1;  
+        
+        if (letter != '\n')
         {
-            if ((unsigned char)text[i] == 0xc2)         // UTF-8 encoding identification
-            {
-                // Support UTF-8 encoded values from [0xc2 0x80] -> [0xc2 0xbf](¿)
-                letter = (unsigned char)text[i + 1];
-                index = GetGlyphIndex(font, (int)letter);
-                i++;
-            }
-            else if ((unsigned char)text[i] == 0xc3)    // UTF-8 encoding identification
-            {
-                // Support UTF-8 encoded values from [0xc3 0x80](À) -> [0xc3 0xbf](ÿ)
-                letter = (unsigned char)text[i + 1];
-                index = GetGlyphIndex(font, (int)letter + 64);
-                i++;
-            }
-            else index = GetGlyphIndex(font, (unsigned char)text[i]);
-
+            index = GetGlyphIndex(font, letter);
             if (font.chars[index].advanceX != 0) textWidth += font.chars[index].advanceX;
             else textWidth += (font.chars[index].rec.width + font.chars[index].offsetX);
         }
@@ -1065,6 +1129,23 @@ unsigned int TextLength(const char *text)
     return length;
 }
 
+// Count the codepoints in the UTF8 encoded, '\0' terminated `text`.
+// NOTE: Each byte that fails to decode is counted as one `?`(0x3f) codepoint, so an invalid sequence never skips bytes.
+unsigned int TextCountCodepoints(const char *text)
+{
+    unsigned int len = 0;
+    const char *ptr = text;     // Read-only cursor, no need to cast away const
+    while (*ptr != '\0')
+    {
+        int next = 0;
+        int letter = GetNextCodepoint(ptr, &next);
+        // On `?` step one byte only: `next` may cover bytes consumed by a failed decode (a literal `?` is one byte anyway)
+        ptr += (letter == 0x3f)? 1 : next;
+        ++len;
+    }
+    return len;
+}
+
 // Formatting of text with variables to 'embed'
 const char *TextFormat(const char *text, ...)
 {
author	Demizdor <avalorrian@gmail.com>	2019-04-21 12:27:46 +0300
committer	Demizdor <avalorrian@gmail.com>	2019-04-21 12:27:46 +0300
commit	f3a5a6871d4ec005026817e5f2579a6f55938dc2 (patch)
tree	824844999de400efeca863d7b0051018dc6f1536
parent	8c22f685d168000eabfc994a09c3a2a61f7f633f (diff)
download	raylib-f3a5a6871d4ec005026817e5f2579a6f55938dc2.tar.gz raylib-f3a5a6871d4ec005026817e5f2579a6f55938dc2.zip