encoding.inl - エンジンの C API リファレンス

encoding.inl
  1. #include "assert.h"
  2. #include "array.h"
  3. #include <wchar.h>
  4. #include <string.h>
  5. namespace stingray_plugin_foundation {
  6. inline char *encoding::utf8_encode(int c, char *utf8)
  7. {
  8. if (c<0x80) {
  9. *utf8 = c;
  10. return utf8 + 1;
  11. } else if (c<0x800) {
  12. utf8[0] = (c>>6)|0xC0;
  13. utf8[1] = (c&0x3F)|0x80;
  14. return utf8 + 2;
  15. } else if (c<0x10000) {
  16. utf8[0] = (c>>12)|0xE0;
  17. utf8[1] = ((c>>6)&0x3F)|0x80;
  18. utf8[2] = (c&0x3F)|0x80;
  19. return utf8 + 3;
  20. } else if (c<0x110000) {
  21. utf8[0] = (c>>18)|0xf0;
  22. utf8[1] = ((c>>12)&0x3F)|0x80;
  23. utf8[2] = ((c>>6)&0x3F)|0x80;
  24. utf8[3] = (c&0x3F)|0x80;
  25. return utf8 + 4;
  26. } else {
  27. XERROR("Cannot encode character");
  28. }
  29. return utf8;
  30. }
  31. inline const char *encoding::utf8_decode(const char *utf8, int &codepoint)
  32. {
  33. char c = *utf8;
  34. if ((c&0x80)==0x0) {
  35. codepoint = c;
  36. return utf8 + 1;
  37. } else if ((c&0xE0)==0xC0) {
  38. unsigned char d = utf8[1];
  39. XASSERT((d&0xC0)==0x80, "Archive not utf-8 encoded %s", utf8);
  40. codepoint = (static_cast<int>(c&0x1F)<<6) | static_cast<int>(d&0x3F);
  41. return utf8 + 2;
  42. } else if ((c&0xF0)==0xE0) {
  43. const char *d = utf8 + 1;
  44. XASSERT(((d[0]&0xC0)==0x80) && ((d[1]&0xC0)==0x80), "Archive not utf-8 encoded %s", utf8);
  45. codepoint = (static_cast<int>(c&0x0f)<<12) | (static_cast<int>(d[0]&0x3f)<<6) |
  46. static_cast<int>(d[1]&0x3f);
  47. return utf8 + 3;
  48. } else if ((c&0xf8)==0xf0) {
  49. const char *d = utf8 + 1;
  50. XASSERT(((d[0]&0xc0)==0x80) && ((d[1]&0xc0)==0x80) &&
  51. ((d[2]&0xc0)==0x80), "Archive not utf-8 encoded %s", utf8);
  52. codepoint = (static_cast<int>(c&0x07)<<18) | (static_cast<int>(d[0]&0x3f)<<12) |
  53. (static_cast<int>(d[1]&0x3f)<<6) | static_cast<int>(d[2]&0x3f);
  54. return utf8 + 4;
  55. } else {
  56. XERROR("Archive not utf-8 encoded %s", utf8);
  57. return utf8;
  58. }
  59. }
  60. inline unsigned encoding::utf8_codepoint_bytes(const char *buffer)
  61. {
  62. char c = *buffer;
  63. if ((c&0x80)==0x0)
  64. return 1;
  65. else if ((c&0xE0)==0xC0)
  66. return 2;
  67. else if ((c&0xF0)==0xE0)
  68. return 3;
  69. else if ((c&0xf8)==0xf0)
  70. return 4;
  71. else {
  72. XERROR("Length on part of utf-8 character");
  73. return 1;
  74. }
  75. }
  76. inline void encoding::utf8_decode(const char *utf8, Array<unsigned> &codepoints)
  77. {
  78. while (*utf8) {
  79. int c;
  80. utf8 = utf8_decode(utf8, c);
  81. codepoints.push_back((unsigned)c);
  82. }
  83. }
  84. inline void encoding::utf8_encode(const Array<unsigned> &codepoints, Array<char> &utf8)
  85. {
  86. utf8_encode(codepoints.begin(), codepoints.size(), utf8);
  87. }
  88. inline void encoding::utf8_encode(const unsigned *codepoints, unsigned size, Array<char> &utf8)
  89. {
  90. const unsigned *it(codepoints), *end(codepoints + size);
  91. for(; it != end; ++it) {
  92. utf8.reserve(utf8.size() + 4);
  93. const char *new_ptr = utf8_encode(*it, utf8.end());
  94. utf8.resize((unsigned)(new_ptr - utf8.begin()));
  95. }
  96. }
  97. inline void encoding::utf8_location(const char *utf8, unsigned index, unsigned &begin, unsigned &end)
  98. {
  99. XASSERT(index < strlen(utf8), "Index out of string");
  100. // walk backwards to find beginning of utf8 character
  101. begin = index;
  102. while(true) {
  103. if (begin == 0)
  104. break;
  105. if ((utf8[begin] & 0xc0) != 0x80)
  106. break;
  107. --begin;
  108. }
  109. end = begin + utf8_codepoint_bytes(utf8 + begin);
  110. }
  111. inline const char * encoding::utf8_valid_first(const char *utf8)
  112. {
  113. char c = *utf8;
  114. const char *d = utf8 + 1;
  115. if ((c&0x80)==0x0) return utf8 + 1;
  116. else if ((c&0xE0)==0xC0) return (d[0]&0xC0)==0x80 ? utf8+2 : nullptr;
  117. else if ((c&0xF0)==0xE0) return (d[0]&0xC0)==0x80 && (d[1]&0xC0)==0x80 ? utf8+3 : nullptr;
  118. else if ((c&0xf8)==0xf0) return (d[0]&0xc0)==0x80 && (d[1]&0xc0)==0x80 && (d[2]&0xc0)==0x80 ? utf8+4 : nullptr;
  119. else return nullptr;
  120. }
  121. inline bool encoding::utf8_valid_all(const char *utf8)
  122. {
  123. while (*utf8) {
  124. utf8 = utf8_valid_first(utf8);
  125. if (!utf8)
  126. return false;
  127. }
  128. return true;
  129. }
  130. inline unsigned encoding::wstr_to_utf8_bytes(const wchar_t *ucs2)
  131. {
  132. char buffer[5];
  133. unsigned size = 0;
  134. while (*ucs2) {
  135. size += (unsigned)(utf8_encode(*ucs2, &buffer[0]) - &buffer[0]);
  136. ++ucs2;
  137. }
  138. ++size;
  139. return size;
  140. }
  141. inline void encoding::wstr_to_utf8(const wchar_t *ucs2, char *utf8, unsigned size)
  142. {
  143. char *b = utf8;
  144. while (*ucs2) {
  145. b = utf8_encode(*ucs2, b);
  146. ++ucs2;
  147. }
  148. XENSURE(unsigned(b - utf8) < size);
  149. *b = 0;
  150. }
  151. inline void encoding::wstr_to_utf8(const wchar_t *ucs2, Array<char> &utf8)
  152. {
  153. unsigned n = wstr_to_utf8_bytes(ucs2);
  154. utf8.resize(n);
  155. wstr_to_utf8(ucs2, utf8.begin(), n);
  156. }
  157. inline unsigned encoding::utf8_to_wstr_tokens(const char *utf8)
  158. {
  159. unsigned tokens = 0;
  160. while (*utf8) {
  161. ++tokens;
  162. utf8 += utf8_codepoint_bytes(utf8);
  163. }
  164. ++tokens;
  165. return tokens;
  166. }
  167. inline void encoding::utf8_to_wstr(const char *utf8, wchar_t *ucs2, unsigned tokens)
  168. {
  169. wchar_t *b = ucs2;
  170. while (*utf8) {
  171. int c;
  172. utf8 = utf8_decode(utf8, c);
  173. *b = c;
  174. ++b;
  175. }
  176. XENSURE(unsigned(b - ucs2) < tokens);
  177. *b = 0;
  178. }
  179. inline void encoding::utf8_to_wstr(const char *utf8, Array<wchar_t> &ucs2)
  180. {
  181. unsigned n = utf8_to_wstr_tokens(utf8);
  182. ucs2.resize(n);
  183. utf8_to_wstr(utf8, ucs2.begin(), n);
  184. }
  185. inline wchar_t *encoding::utf8_to_wstr(const char *utf8, Allocator &a)
  186. {
  187. unsigned n = utf8_to_wstr_tokens(utf8);
  188. wchar_t *res = (wchar_t *)a.allocate(sizeof(wchar_t)*n);
  189. utf8_to_wstr(utf8, res, n);
  190. return res;
  191. }
  192. } // namespace stingray_plugin_foundation