文字列で使用される文字セルの数

Question

UTF-8文字列を使用してテキストテーブルを出力するプログラムがあります。文字列で使用される等幅文字セルの数を測定して、適切に配置できるようにする必要があります。できれば標準機能でやってみたいです。

Maxim Egorushkin · Accepted Answer

「UTF-8 and Unicode FAQ for Unix/Linux」より：

Cでは、mbstowcs(NULL,s,0)を使用して移植可能な方法で文字数をカウントできます。これは、適切なロケールが選択されている限り、サポートされている他のエンコーディングと同様にUTF-8でも機能します。 UTF-8文字列内の文字数を数えるハードワイヤード手法は、0x80〜0xBFの範囲を除くすべてのバイトを数えることです。これらは単なる継続バイトであり、独自の文字ではないためです。ただし、文字数をカウントする必要性は、アプリケーションでは驚くほどまれに発生します。

mpez0 · Answer

UTF-8互換のstrlen(3)関数が利用できる場合もあれば、利用できない場合もあります。ただし、この処理をすばやく行う簡単なC関数がいくつか公開されており、すぐに利用できます。

効率的なCソリューションは、文字の先頭を調べて継続バイトをスキップします。単純なコード（上記のリンクから参照）は

/*
 * Count the characters in a NUL-terminated UTF-8 string.
 *
 * Every byte is counted except continuation bytes (bit pattern 10xxxxxx),
 * so each multi-byte sequence contributes exactly one to the result.
 * The input is not validated.
 */
int my_strlen_utf8_c(char *s)
{
    int count = 0;

    for (int idx = 0; s[idx] != '\0'; idx++) {
        if ((s[idx] & 0xc0) == 0x80) {
            continue; /* continuation byte: part of the previous character */
        }
        count++;
    }
    return count;
}

高速バージョンは同じ手法を使用しますが、データをプリフェッチしてマルチバイト比較を行うため、大幅なスピードアップになります。ただし、コードは長く、複雑です。

Functino · Answer

誰もこれについて言及していなかったのでショックを受けました。

端末でテキストを揃えたい場合は、POSIX関数 wcwidth および wcswidth を使用する必要があります。文字列の画面上の長さを見つけるための正しいプログラムを次に示します。

#define _XOPEN_SOURCE
#include <wchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>

/*
 * Measure how many terminal columns a multibyte string occupies.
 *
 * The string is decoded with the current locale (LC_CTYPE), so the caller
 * must have selected a suitable locale with setlocale() beforehand.
 *
 * Returns the column width as reported by wcswidth() (which is -1 when the
 * string contains a non-printable wide character), -1 if memory allocation
 * fails, or -2 if the string is not a valid multibyte sequence.
 */
int measure(char *string)
{
    /* First pass: ask how many wide characters the string decodes to. */
    size_t length = mbstowcs(NULL, string, 0);
    if (length == (size_t)-1) {
        return -2;
    }

    /* One extra slot for the terminating null wide character. */
    size_t needed = length + 1;
    wchar_t *wcstring = malloc(needed * sizeof *wcstring);
    if (!wcstring) {
        return -1;
    }

    int width = -2;
    /* Second pass: convert into the buffer, then measure it. */
    if (mbstowcs(wcstring, string, needed) != (size_t)-1) {
        width = wcswidth(wcstring, needed);
    }

    /* Released on both the success and the conversion-failure path. */
    free(wcstring);
    return width;
}

int main(int argc, char **argv)
{
    /* Use the locale from the environment so multibyte decoding works. */
    setlocale(LC_ALL, "");

    for (int i = 1; i < argc; i++) {
        printf("%s: %d\n", argv[i], measure(argv[i]));
    }
    return 0;
}

これが実行中の例です：

$ ./measure hello 莊子 cＡb
hello: 5
莊子: 4
cＡb: 4

2文字の「莊子」と3文字の「cＡb」（全角のＡに注意）が、どちらも4カラム幅になっていることに注目してください。

utf8everywhere.org は次のように述べています。

画面に表示される文字列のサイズは、文字列内のコードポイントの数とは関係ありません。サイズを知るには、レンダリングエンジンと通信する必要があります。等幅フォントや端末であっても、1つのコードポイントが1列を占めるとは限りません。POSIXはこの点を考慮しています。

Windowsには、コンソール出力用の組み込みwcwidth関数がありません。 Windowsコンソールで複数列の文字をサポートする場合 ~~wcwidthの移植可能な実装を見つける必要があります~~ WindowsコンソールはクレイジーなハックなしではUnicodeをサポートしないため、あきらめてください。

Nick · Answer

サードパーティのライブラリを使用できる場合は、IBMのICUライブラリをご覧ください。

http://site.icu-project.org/

masakielastic · Answer

次のコードは、不正なバイトシーケンスを考慮しています。文字列データの例は、Unicode Standard 6.3の「表3-8 UTF-8変換におけるU+FFFDの使用」から取得したものです。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

/* True when byte c is a UTF-8 continuation (trail) byte, 0x80..0xBF. */
#define is_trail(c) ((c) > 0x7F && (c) < 0xC0)
#define SUCCESS 1
#define FAILURE (-1)

int utf8_get_next_char(const unsigned char *str, size_t str_size,
                       size_t *cursor, int *is_valid,
                       unsigned int *code_point);
int utf8_length(unsigned char *str, size_t str_size);
void utf8_print_each_char(unsigned char *str, size_t str_size);

int main(void)
{
    unsigned char *str;

    /* Sample containing several ill-formed sequences. */
    str = (unsigned char *) "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";
    size_t str_size = strlen((const char*) str);

    puts(10 == utf8_length(str, str_size) ? "true" : "false");
    utf8_print_each_char(str, str_size);

    return EXIT_SUCCESS;
}

/*
 * Count the units in str[0..str_size): each well-formed UTF-8 character
 * counts as one, and each ill-formed subsequence reported by
 * utf8_get_next_char() also counts as one.
 */
int utf8_length(unsigned char *str, size_t str_size)
{
    int length = 0;
    size_t next_pos = 0;
    int is_valid = 0;
    unsigned int code_point = 0;

    while (utf8_get_next_char(str, str_size, &next_pos, &is_valid,
                              &code_point) == SUCCESS) {
        ++length;
    }

    return length;
}

/*
 * Print each unit of str[0..str_size) on its own line. Well-formed
 * characters are printed as-is; every ill-formed subsequence is printed
 * as U+FFFD (encoded in UTF-8).
 */
void utf8_print_each_char(unsigned char *str, size_t str_size)
{
    size_t pos = 0;
    size_t next_pos = 0;
    int is_valid = 0;
    unsigned int code_point = 0;

    while (utf8_get_next_char(str, str_size, &next_pos, &is_valid,
                              &code_point) == SUCCESS) {
        if (is_valid) {
            /* A unit is at most 4 bytes, so the length fits in an int. */
            printf("%.*s\n", (int) (next_pos - pos), str + pos);
        } else {
            puts("\xEF\xBF\xBD");
        }
        pos = next_pos;
    }
}

/*
 * Decode the unit that starts at str[*cursor].
 *
 * On SUCCESS, *cursor is advanced past the unit, *is_valid tells whether
 * it was a well-formed UTF-8 sequence, and *code_point holds the decoded
 * value (0 when the unit is ill-formed). An ill-formed unit consumes the
 * lead byte plus any following bytes that were accepted before the
 * sequence broke, so the caller can substitute one replacement character
 * per unit.
 *
 * Returns FAILURE when *cursor is at or beyond str_size; in that case
 * *cursor is left unchanged, *is_valid is false and *code_point is 0.
 */
int utf8_get_next_char(const unsigned char *str, size_t str_size,
                       size_t *cursor, int *is_valid,
                       unsigned int *code_point)
{
    size_t pos = *cursor;

    *code_point = 0;
    *is_valid = false;

    if (pos >= str_size) {
        return FAILURE;
    }

    /* At least one byte remains, so rest_size >= 1 from here on. */
    size_t rest_size = str_size - pos;
    unsigned char c = str[pos];
    unsigned char min;
    unsigned char max;

    if (c < 0x80) {
        /* Single-byte (ASCII) character. */
        *code_point = c;
        *is_valid = true;
        pos += 1;
    } else if (c < 0xC2) {
        /* Stray continuation byte, or lead byte 0xC0/0xC1. */
        pos += 1;
    } else if (c < 0xE0) {
        /* Two-byte sequence. */
        if (rest_size < 2 || !is_trail(str[pos + 1])) {
            pos += 1;
        } else {
            *code_point = ((str[pos] & 0x1F) << 6)
                        | (str[pos + 1] & 0x3F);
            *is_valid = true;
            pos += 2;
        }
    } else if (c < 0xF0) {
        /* Three-byte sequence; the second byte range is narrowed for the
         * lead bytes 0xE0 and 0xED. */
        min = (c == 0xE0) ? 0xA0 : 0x80;
        max = (c == 0xED) ? 0x9F : 0xBF;

        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            pos += 1;
        } else if (rest_size < 3 || !is_trail(str[pos + 2])) {
            pos += 2;
        } else {
            *code_point = ((str[pos] & 0x0F) << 12)
                        | ((str[pos + 1] & 0x3F) << 6)
                        | (str[pos + 2] & 0x3F);
            *is_valid = true;
            pos += 3;
        }
    } else if (c < 0xF5) {
        /* Four-byte sequence; the second byte range is narrowed for the
         * lead bytes 0xF0 and 0xF4. */
        min = (c == 0xF0) ? 0x90 : 0x80;
        max = (c == 0xF4) ? 0x8F : 0xBF;

        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            pos += 1;
        } else if (rest_size < 3 || !is_trail(str[pos + 2])) {
            pos += 2;
        } else if (rest_size < 4 || !is_trail(str[pos + 3])) {
            pos += 3;
        } else {
            *code_point = ((str[pos] & 0x7) << 18)
                        | ((str[pos + 1] & 0x3F) << 12)
                        | ((str[pos + 2] & 0x3F) << 6)
                        | (str[pos + 3] & 0x3F);
            *is_valid = true;
            pos += 4;
        }
    } else {
        /* Lead bytes 0xF5..0xFF are treated as ill-formed. */
        pos += 1;
    }

    *cursor = pos;

    return SUCCESS;
}

UTF-8を扱うコードを書く際は、Unicode Standard 6.3の「表3-7 整形式のUTF-8バイトシーケンス」を参照してください。

| Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
|---|---|---|---|---|
| U+0000 - U+007F | 00 - 7F | | | |
| U+0080 - U+07FF | C2 - DF | 80 - BF | | |
| U+0800 - U+0FFF | E0 | A0 - BF | 80 - BF | |
| U+1000 - U+CFFF | E1 - EC | 80 - BF | 80 - BF | |
| U+D000 - U+D7FF | ED | 80 - 9F | 80 - BF | |
| U+E000 - U+FFFF | EE - EF | 80 - BF | 80 - BF | |
| U+10000 - U+3FFFF | F0 | 90 - BF | 80 - BF | 80 - BF |
| U+40000 - U+FFFFF | F1 - F3 | 80 - BF | 80 - BF | 80 - BF |
| U+100000 - U+10FFFF | F4 | 80 - 8F | 80 - BF | 80 - BF |

lmedinas · Answer

また、UTF-8を処理する際に非常に使いやすいglibを使用することもできます。 glibリファレンスドキュメント