私が作業しているテーブルには、3種類の列があります。
ID 列(別のテーブルの主キー)
データ列
from / to の日付列
値の例:
ID Data From To
1 a 2015-01-01 2015-01-05
1 a 2015-01-06 2015-01-10
1 b 2015-01-11 2015-01-15
1 a 2015-01-16 2015-01-20
2 c 2015-01-01 2015-01-05
2 c 2015-01-06 2015-01-10
テーブルは、一定の間隔で別のデータソースの「スナップショット」を取得し、有効期限をレコードに割り当てることによって更新されます。問題は、これらのスナップショットが、その間隔中にまったく変更されなかった(異なる有効日付を持つ)レコードの重複エントリを作成することです。
連続した日付を持つ行を探し、それらをマージして単一の有効期間を割り当てることにより、テーブルのサイズを小さくしたいと考えています。例えば:
ID Data From To
1 a 2015-01-01 2015-01-10
1 b 2015-01-11 2015-01-15
1 a 2015-01-16 2015-01-20
2 c 2015-01-01 2015-01-10
私が現在持っているロジックは:
カーソルは非常に効率が悪い(データセットが大きい)ことを理解しているので、他の方法を探しています。
これが連続する範囲のみを含むテーブルであれば、古典的な「ギャップとアイランド」問題として扱うことができます。その場合は、連続する範囲のアイランドを切り出し、アイランドごとに [from] の最小値と [to] の最大値をとって「圧縮」するだけで済みます。
2つのROW_NUMBER呼び出しを使用してこれを解決する確立された方法があります。
-- Collapse runs of consecutive rows that share the same id and data into one
-- row per run (classic "gaps and islands" with two ROW_NUMBER sequences).
-- Rows are taken in [from] order; the date ranges themselves are not checked
-- for contiguity.
WITH numbered AS
(
    SELECT
        id,
        data,
        [from],
        [to],
        -- position of the row among all rows of this id
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY [from]) AS seq_in_id,
        -- position of the row among rows of this id with the same data
        ROW_NUMBER() OVER (PARTITION BY id, data ORDER BY [from]) AS seq_in_data
    FROM #mergeTest
)
SELECT
    id,
    data,
    MIN([from]) AS [from],
    MAX([to]) AS [to]
FROM numbered
GROUP BY
    id,
    data,
    -- the difference of the two sequences is constant within a run, so it
    -- serves as the island key
    seq_in_id - seq_in_data
;
このクエリは、SQL Server 2005 のような古いバージョンでも動作します。
この問題を解決するクエリを作成できました。複数の結合とwhileループを使用してレコードをマージします。このコードは、SQL Server 2008 R2と互換性があります。
-- Test fixture: date ranges per id, with a nullable [data] column so that
-- NULL handling in the merge logic is exercised.
CREATE TABLE #mergeTest
(
    [id]   int  NOT NULL,
    [data] date,
    [from] date NOT NULL,
    [to]   date NOT NULL
);

-- Each range starts on the day the previous one ends.
INSERT INTO #mergeTest
    ([id], [data], [from], [to])
VALUES
    (1, NULL,         '2015-01-01', '2015-01-05'),  -- row 1
    (1, NULL,         '2015-01-05', '2015-01-10'),  -- row 2
    (1, '2000-01-01', '2015-01-10', '2015-01-14'),  -- row 3
    (1, '2000-01-03', '2015-01-14', '2015-01-15'),  -- row 4
    (1, '2000-01-01', '2015-01-15', '2015-01-20'),  -- row 5
    (1, '2000-01-01', '2015-01-20', '2015-01-22'),  -- row 6
    (1, '2000-01-01', '2015-01-22', '2015-01-25'),  -- row 7
    (1, '2000-01-01', '2015-01-25', '2015-01-30'),  -- row 8
    (1, NULL,         '2015-01-30', '2015-02-04'),  -- row 9
    (2, '2000-01-05', '2015-01-01', '2015-01-05'),  -- row 10
    (2, '2000-01-05', '2015-01-05', '2015-01-10')   -- row 11

-- Echo the fixture.
SELECT
    [id],
    [data],
    [from],
    [to]
FROM #mergeTest
GO
-- Isolate (id, data) combinations that occur on exactly one row of
-- #mergeTest. Such rows have nothing to merge with and need no processing.
-- MIN() over a one-row group simply returns that row's [from] / [to].
SELECT
    [id],
    [data],
    MIN([from]) AS [from],
    MIN([to])   AS [to]
INTO #tempSingle
FROM #mergeTest
GROUP BY
    [id],
    [data]
HAVING COUNT([id]) = 1;
GO
-- Collect the (id, data) combinations that occur on two or more rows of
-- #mergeTest; these are the candidates for merging.
-- The temp table name is spelled identically everywhere so the script also
-- runs when tempdb uses a case-sensitive collation.
SELECT
    [id],
    [data]
INTO #tempRemainingtemp
FROM #mergeTest
GROUP BY
    [id],
    [data]
HAVING COUNT(*) >= 2;

-- Diagnostic output: the duplicated (id, data) keys.
SELECT
    [id],
    [data]
FROM #tempRemainingtemp;

-- Pull every #mergeTest row belonging to a duplicated key into #tempRemaining.
-- [data] is nullable, so two NULLs are explicitly treated as a match.
SELECT
    [src].[id],
    [src].[data],
    [src].[from],
    [src].[to]
INTO #tempRemaining
FROM #tempRemainingtemp AS [dup]
INNER JOIN #mergeTest AS [src]
    ON  [dup].[id] = [src].[id]
    AND (
            [dup].[data] = [src].[data]
         OR ([dup].[data] IS NULL AND [src].[data] IS NULL)
        );

DROP TABLE #tempRemainingtemp;
GO
-- Diagnostic output: all rows that belong to a duplicated (id, data) key.
SELECT * FROM #tempRemaining
-- Reduce each chain of touching ranges (previous [to] = next [from], same id
-- and data) to its first and last row. Rows that touch nothing are moved to
-- #tempSingle. On exit #temp holds the remaining rows without helper columns.
BEGIN
-- #temp0: rows that have a predecessor, i.e. another row with the same id and
-- data (NULL data matches NULL data) whose [to] equals this row's [from].
-- [prevfrom] carries the predecessor's [from].
SELECT t1.*, t2.[from] as [prevfrom] INTO #temp0
FROM #tempRemaining AS t1
JOIN #tempRemaining AS t2
ON t2.[to] = t1.[from]
AND t1.[id] = t2.[id]
AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
-- #temp1: every row of #tempRemaining plus [prevfrom]; the LEFT JOIN leaves
-- [prevfrom] NULL for rows that have no predecessor.
SELECT t1.*, t2.[prevfrom] INTO #temp1
FROM #tempRemaining AS t1
LEFT JOIN #temp0 AS t2
ON t1.[id]=t2.[id]
AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
AND t1.[from] = t2.[from];
DROP TABLE #temp0;
-- #temp2: rows that have a successor, i.e. another row with the same id and
-- data whose [from] equals this row's [to]. [nextto] carries the successor's [to].
SELECT t1.*, t2.[to] as [nextto] INTO #temp2
FROM #temp1 AS t1
JOIN #temp1 AS t2
ON t2.[from] = t1.[to]
AND t1.[id] = t2.[id]
AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL);
-- #temp: every row of #temp1 plus [nextto]; the LEFT JOIN leaves [nextto]
-- NULL for rows that have no successor.
SELECT t1.*, t2.[nextto] INTO #temp
FROM #temp1 AS t1
LEFT JOIN #temp2 AS t2
ON t1.[id]=t2.[id]
AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
AND t1.[from] = t2.[from];
DROP TABLE #temp2;
DROP TABLE #temp1;
-- Rows with both a predecessor and a successor are interior to a chain and
-- are redundant once the chain's end rows are kept.
DELETE FROM #temp
WHERE [prevfrom] IS NOT NULL
AND [nextto] IS NOT NULL;
-- Rows with neither a predecessor nor a successor touch no other row of their
-- (id, data) key. They are removed from #temp and, via OUTPUT, appended to
-- #tempSingle. The CTE exposes only the four data columns so that deleted.*
-- matches #tempSingle's column list.
WITH cte AS (
SELECT [id], [data], [from], [to]
FROM [#temp]
WHERE [prevfrom] IS NULL
AND [nextto] IS NULL)
DELETE FROM cte
OUTPUT deleted.* INTO #tempSingle
-- Every row left in #temp now has exactly one of [prevfrom] / [nextto] set.
-- Author's note: all duplicate records are now reduced to pairs.
-- Diagnostic output of the remaining rows, helper columns included.
SELECT * FROM #temp;
-- Drop the helper columns so #temp is back to (id, data, from, to).
ALTER TABLE #temp
DROP COLUMN [nextto],[prevfrom]
END
-- Merge the remaining rows of #temp pairwise into #tempResult.
--
-- Input:  #temp (id, data, from, to), expected to hold, for every chain of
--         touching ranges, its first row followed by its last row when
--         ordered by [from] within an (id, data) key.
-- Output: #tempResult with one row per pair: [from] of the first row and
--         [to] of the second row. #temp is left empty.
--
-- The pairing is done in a single set-based statement instead of consuming
-- two rows per loop iteration. ROW_NUMBER is available from SQL Server 2005,
-- so SQL Server 2008 R2 compatibility is kept.

-- Empty result table with the same column layout as #temp.
SELECT TOP (0)
    [id],
    [data],
    [from],
    [to]
INTO #tempResult
FROM #temp;

WITH ordered AS (
    SELECT
        [id],
        [data],
        [from],
        [to],
        -- Numbering restarts per (id, data), so a malformed group cannot
        -- shift the pairing of the groups that follow it.
        ROW_NUMBER() OVER (PARTITION BY [id], [data] ORDER BY [from]) AS rn
    FROM #temp
)
INSERT INTO #tempResult ([id], [data], [from], [to])
SELECT
    [id],
    [data],
    MIN(CASE WHEN rn % 2 = 1 THEN [from] END) AS [from],  -- first row of the pair
    MAX(CASE WHEN rn % 2 = 0 THEN [to] END)   AS [to]     -- second row of the pair
FROM ordered
GROUP BY
    [id],
    [data],
    (rn - 1) / 2            -- integer division: rows 1-2 -> 0, rows 3-4 -> 1, ...
HAVING COUNT(*) = 2                  -- an unpaired trailing row produces no result
   AND MIN([from]) < MAX([from]);    -- the second row must start strictly later

-- All rows have been consumed.
TRUNCATE TABLE #temp;
-- Replace the contents of #mergeTest with the merged rows (#tempResult) and
-- the rows that needed no merging (#tempSingle).
-- Columns are listed explicitly so the load does not depend on the physical
-- column order of the intermediate temp tables.
TRUNCATE TABLE #mergeTest;

INSERT INTO #mergeTest ([id], [data], [from], [to])
SELECT [id], [data], [from], [to]
FROM #tempResult
UNION ALL
SELECT [id], [data], [from], [to]
FROM #tempSingle;

-- Final result.
SELECT
    [id],
    [data],
    [from],
    [to]
FROM #mergeTest
ORDER BY
    [id],
    [from];
日付範囲が連続していない(間にギャップがある)場合は、データが同じでも別々の区間として残す必要があります。そのようなケースのために、私は次の解決策を思いつきました:
-- Merge rows of dat into intervals per ID: consecutive rows (in [From] order)
-- are merged while Data is unchanged and the next [From] is at most one day
-- after the previous [To]. Requires LAG (SQL Server 2012 or later).
WITH lag_info AS (
    -- Attach the previous row's [To] and Data within the same ID.
    SELECT
        ID,
        Data,
        [From],
        [To],
        LAG([To], 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevTo,
        LAG(Data, 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevData
    FROM dat
),
segmented AS (
    -- Flag the rows that start a new interval.
    SELECT
        ID,
        Data,
        [From],
        [To],
        CASE
            -- No previous [To]: first row for the ID (or the previous row has
            -- no end date), so a new interval starts here.
            WHEN PrevTo IS NULL THEN 1
            -- Gap of more than one day between the previous [To] and this [From].
            WHEN DATEDIFF(DAY, PrevTo, [From]) > 1 THEN 1
            -- Data changed. NULL is compared explicitly because `<>` yields
            -- UNKNOWN when either side is NULL; two NULLs count as unchanged.
            WHEN Data <> PrevData THEN 1
            WHEN Data IS NULL AND PrevData IS NOT NULL THEN 1
            WHEN Data IS NOT NULL AND PrevData IS NULL THEN 1
            ELSE 0
        END AS is_new_interval
    FROM lag_info
),
segmented_marked AS (
    -- A running sum of the flags gives every row the number of its interval.
    SELECT
        ID,
        [From],
        [To],
        Data,
        SUM(s.is_new_interval)
            OVER (PARTITION BY ID ORDER BY [From]
                  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
            AS interval_id
    FROM segmented AS s
)
SELECT
    ID,
    MIN([From]) AS [From],
    MAX([To])   AS [To],
    Data
FROM segmented_marked
GROUP BY
    ID,
    Data,
    interval_id