SQL Serverに列があります utf8 SQL_Latin1_General_CP1_CI_ASエンコーディング。 ISO 8859-1エンコーディングでテキストを変換して保存するにはどうすればよいですか? SQL Serverでクエリを実行したいのですが。任意のヒント?
オレ。 Gostei do jogo。 Quando "baixei"atéachei quenão iria curtir muito
varchar
フィールドに格納されているUTF-8テキストを修復する関数を作成しました。
固定値を確認するには、次のように使用できます。
CREATE TABLE #Table1 (Column1 varchar(max))
INSERT #Table1
VALUES ('Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito')
SELECT *, NewColumn1 = dbo.DecodeUTF8String(Column1)
FROM Table1
WHERE Column1 <> dbo.DecodeUTF8String(Column1)
出力:
Column1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito
NewColumn1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito
コード:
CREATE FUNCTION dbo.DecodeUTF8String (@value varchar(max))
RETURNS nvarchar(max)
AS
BEGIN
-- Transforms a UTF-8 encoded varchar string into Unicode
-- By Anthony Faull 2014-07-31
DECLARE @result nvarchar(max);
-- If ASCII or null there's no work to do
IF (@value IS NULL
OR @value NOT LIKE '%[^ -~]%' COLLATE Latin1_General_BIN
)
RETURN @value;
-- Generate all integers from 1 to the length of string
WITH e0(n) AS (SELECT TOP(POWER(2,POWER(2,0))) NULL FROM (VALUES (NULL),(NULL)) e(n))
, e1(n) AS (SELECT TOP(POWER(2,POWER(2,1))) NULL FROM e0 CROSS JOIN e0 e)
, e2(n) AS (SELECT TOP(POWER(2,POWER(2,2))) NULL FROM e1 CROSS JOIN e1 e)
, e3(n) AS (SELECT TOP(POWER(2,POWER(2,3))) NULL FROM e2 CROSS JOIN e2 e)
, e4(n) AS (SELECT TOP(POWER(2,POWER(2,4))) NULL FROM e3 CROSS JOIN e3 e)
, e5(n) AS (SELECT TOP(POWER(2.,POWER(2,5)-1)-1) NULL FROM e4 CROSS JOIN e4 e)
, numbers(position) AS
(
SELECT TOP(DATALENGTH(@value)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM e5
)
-- UTF-8 Algorithm (http://en.wikipedia.org/wiki/UTF-8)
-- For each octet, count the high-order one bits, and extract the data bits.
, octets AS
(
SELECT position, highorderones, partialcodepoint
FROM numbers a
-- Split UTF8 string into rows of one octet each.
CROSS APPLY (SELECT octet = ASCII(SUBSTRING(@value, position, 1))) b
-- Count the number of leading one bits
CROSS APPLY (SELECT highorderones = 8 - FLOOR(LOG( ~CONVERT(tinyint, octet) * 2 + 1)/LOG(2))) c
CROSS APPLY (SELECT databits = 7 - highorderones) d
CROSS APPLY (SELECT partialcodepoint = octet % POWER(2, databits)) e
)
-- Compute the Unicode codepoint for each sequence of 1 to 4 bytes
, codepoints AS
(
SELECT position, codepoint
FROM
(
-- Get the starting octect for each sequence (i.e. exclude the continuation bytes)
SELECT position, highorderones, partialcodepoint
FROM octets
WHERE highorderones <> 1
) lead
CROSS APPLY (SELECT sequencelength = CASE WHEN highorderones in (1,2,3,4) THEN highorderones ELSE 1 END) b
CROSS APPLY (SELECT endposition = position + sequencelength - 1) c
CROSS APPLY
(
-- Compute the codepoint of a single UTF-8 sequence
SELECT codepoint = SUM(POWER(2, shiftleft) * partialcodepoint)
FROM octets
CROSS APPLY (SELECT shiftleft = 6 * (endposition - position)) b
WHERE position BETWEEN lead.position AND endposition
) d
)
-- Concatenate the codepoints into a Unicode string
SELECT @result = CONVERT(xml,
(
SELECT NCHAR(codepoint)
FROM codepoints
ORDER BY position
FOR XML PATH('')
)).value('.', 'nvarchar(max)');
RETURN @result;
END
GO
Jason Penny には written もあります。これは、UTF-8をUnicode(MITライセンス)に変換するSQL関数で、簡単な例を示しています。
CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX))
RETURNS NVarChar(MAX)
AS
BEGIN
DECLARE @out NVarChar(MAX), @i int, @c int, @c2 int, @c3 int, @nc int
SELECT @i = 1, @out = ''
WHILE (@i <= Len(@in))
BEGIN
SET @c = Ascii(SubString(@in, @i, 1))
IF (@c < 128)
BEGIN
SET @nc = @c
SET @i = @i + 1
END
ELSE IF (@c > 191 AND @c < 224)
BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
SET @nc = (((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
SET @i = @i + 2
END
ELSE
BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
SET @c3 = Ascii(SubString(@in, @i + 2, 1))
SET @nc = (((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
SET @i = @i + 3
END
SET @out = @out + NChar(@nc)
END
RETURN @out
END
GO
Anthonyがチェックした答えは「見た目」がよくなりますが、変換と不一致の調査を行う場合は両方実行できますか?
また、以下のvery醜いコードを使用して、UTF-8としてエンコードされたBMPページunicode文字を検出し、次にvarcharフィールドからnvarcharフィールドに変換され、UCS-16に変換できます。
LIKE (N'%[' + CONVERT(NVARCHAR,(CHAR(192))) + CONVERT(NVARCHAR,(CHAR(193))) + CONVERT(NVARCHAR,(CHAR(194))) + CONVERT(NVARCHAR,(CHAR(195))) + CONVERT(NVARCHAR,(CHAR(196))) + CONVERT(NVARCHAR,(CHAR(197))) + CONVERT(NVARCHAR,(CHAR(198))) + CONVERT(NVARCHAR,(CHAR(199))) + CONVERT(NVARCHAR,(CHAR(200))) + CONVERT(NVARCHAR,(CHAR(201))) + CONVERT(NVARCHAR,(CHAR(202))) + CONVERT(NVARCHAR,(CHAR(203))) + CONVERT(NVARCHAR,(CHAR(204))) + CONVERT(NVARCHAR,(CHAR(205))) + CONVERT(NVARCHAR,(CHAR(206))) + CONVERT(NVARCHAR,(CHAR(207))) + CONVERT(NVARCHAR,(CHAR(208))) + CONVERT(NVARCHAR,(CHAR(209))) + CONVERT(NVARCHAR,(CHAR(210))) + CONVERT(NVARCHAR,(CHAR(211))) + CONVERT(NVARCHAR,(CHAR(212))) + CONVERT(NVARCHAR,(CHAR(213))) + CONVERT(NVARCHAR,(CHAR(214))) + CONVERT(NVARCHAR,(CHAR(215))) + CONVERT(NVARCHAR,(CHAR(216))) + CONVERT(NVARCHAR,(CHAR(217))) + CONVERT(NVARCHAR,(CHAR(218))) + CONVERT(NVARCHAR,(CHAR(219))) + CONVERT(NVARCHAR,(CHAR(220))) + CONVERT(NVARCHAR,(CHAR(221))) + CONVERT(NVARCHAR,(CHAR(222))) + CONVERT(NVARCHAR,(CHAR(223))) + CONVERT(NVARCHAR,(CHAR(224))) + CONVERT(NVARCHAR,(CHAR(225))) + CONVERT(NVARCHAR,(CHAR(226))) + CONVERT(NVARCHAR,(CHAR(227))) + CONVERT(NVARCHAR,(CHAR(228))) + CONVERT(NVARCHAR,(CHAR(229))) + CONVERT(NVARCHAR,(CHAR(230))) + CONVERT(NVARCHAR,(CHAR(231))) + CONVERT(NVARCHAR,(CHAR(232))) + CONVERT(NVARCHAR,(CHAR(233))) + CONVERT(NVARCHAR,(CHAR(234))) + CONVERT(NVARCHAR,(CHAR(235))) + CONVERT(NVARCHAR,(CHAR(236))) + CONVERT(NVARCHAR,(CHAR(237))) + CONVERT(NVARCHAR,(CHAR(238))) + CONVERT(NVARCHAR,(CHAR(239)))
+ N'][' + CONVERT(NVARCHAR,(CHAR(128))) + CONVERT(NVARCHAR,(CHAR(129))) + CONVERT(NVARCHAR,(CHAR(130))) + CONVERT(NVARCHAR,(CHAR(131))) + CONVERT(NVARCHAR,(CHAR(132))) + CONVERT(NVARCHAR,(CHAR(133))) + CONVERT(NVARCHAR,(CHAR(134))) + CONVERT(NVARCHAR,(CHAR(135))) + CONVERT(NVARCHAR,(CHAR(136))) + CONVERT(NVARCHAR,(CHAR(137))) + CONVERT(NVARCHAR,(CHAR(138))) + CONVERT(NVARCHAR,(CHAR(139))) + CONVERT(NVARCHAR,(CHAR(140))) + CONVERT(NVARCHAR,(CHAR(141))) + CONVERT(NVARCHAR,(CHAR(142))) + CONVERT(NVARCHAR,(CHAR(143))) + CONVERT(NVARCHAR,(CHAR(144))) + CONVERT(NVARCHAR,(CHAR(145))) + CONVERT(NVARCHAR,(CHAR(146))) + CONVERT(NVARCHAR,(CHAR(147))) + CONVERT(NVARCHAR,(CHAR(148))) + CONVERT(NVARCHAR,(CHAR(149))) + CONVERT(NVARCHAR,(CHAR(150))) + CONVERT(NVARCHAR,(CHAR(151))) + CONVERT(NVARCHAR,(CHAR(152))) + CONVERT(NVARCHAR,(CHAR(153))) + CONVERT(NVARCHAR,(CHAR(154))) + CONVERT(NVARCHAR,(CHAR(155))) + CONVERT(NVARCHAR,(CHAR(156))) + CONVERT(NVARCHAR,(CHAR(157))) + CONVERT(NVARCHAR,(CHAR(158))) + CONVERT(NVARCHAR,(CHAR(159))) + CONVERT(NVARCHAR,(CHAR(160))) + CONVERT(NVARCHAR,(CHAR(161))) + CONVERT(NVARCHAR,(CHAR(162))) + CONVERT(NVARCHAR,(CHAR(163))) + CONVERT(NVARCHAR,(CHAR(164))) + CONVERT(NVARCHAR,(CHAR(165))) + CONVERT(NVARCHAR,(CHAR(166))) + CONVERT(NVARCHAR,(CHAR(167))) + CONVERT(NVARCHAR,(CHAR(168))) + CONVERT(NVARCHAR,(CHAR(169))) + CONVERT(NVARCHAR,(CHAR(170))) + CONVERT(NVARCHAR,(CHAR(171))) + CONVERT(NVARCHAR,(CHAR(172))) + CONVERT(NVARCHAR,(CHAR(173))) + CONVERT(NVARCHAR,(CHAR(174))) + CONVERT(NVARCHAR,(CHAR(175))) + CONVERT(NVARCHAR,(CHAR(176))) + CONVERT(NVARCHAR,(CHAR(177))) + CONVERT(NVARCHAR,(CHAR(178))) + CONVERT(NVARCHAR,(CHAR(179))) + CONVERT(NVARCHAR,(CHAR(180))) + CONVERT(NVARCHAR,(CHAR(181))) + CONVERT(NVARCHAR,(CHAR(182))) + CONVERT(NVARCHAR,(CHAR(183))) + CONVERT(NVARCHAR,(CHAR(184))) + CONVERT(NVARCHAR,(CHAR(185))) + CONVERT(NVARCHAR,(CHAR(186))) + CONVERT(NVARCHAR,(CHAR(187))) + CONVERT(NVARCHAR,(CHAR(188))) + CONVERT(NVARCHAR,(CHAR(189))) + CONVERT(NVARCHAR,(CHAR(190))) + CONVERT(NVARCHAR,(CHAR(191)))
+ N']%') COLLATE Latin1_General_BIN
上記:
まだエンコードではなく、必要なクエリを見つけました。
ALTER TABLE dbo.MyTable ALTER COLUMN CharCol
varchar(10)COLLATE Latin1_General_CI_AS NOT NULL;
sQL Server 2017および2019の新しい文字列集計関数string_aggを使用するように少し変更を加えます
SELECT @result=STRING_AGG(NCHAR([codepoint]),'') WITHIN GROUP (ORDER BY position ASC)
FROM codepoints
@result部分をこの部分に変更します。 XMLは依然として古い方法で機能します。 2019では、string_aggはxmlバージョンよりも非常に高速に動作します(当然です... string_aggはネイティブであり、公平な比較ではありません)