I would like to compress an input stream in Java using GZIP compression.
Say I have an uncompressed input stream (1 GB of data ...). I want a compressed input stream from that source as the result:
public InputStream getCompressedStream(InputStream unCompressedStream) {
// Not working because it's uncompressing the stream, I want the opposite.
return new GZIPInputStream(unCompressedStream);
}
DeflaterInputStream is not what you need, because it lacks the gzip header/trailer and uses slightly different compression.
To switch from an OutputStream (push) to an InputStream (pull), you have to do things differently.
This is what GZIPOutputStream does: it writes the gzip header, then the deflated content, then the gzip trailer.
If you want to do the same with InputStreams, you need a stream whose contents are exactly that: header, deflated content, trailer.
The best way to do this is to provide three separate streams and combine them into one. Fortunately, there is SequenceInputStream, which does exactly this kind of combining.
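As a minimal illustration (my sketch, not part of the implementation below), SequenceInputStream simply reads several streams back to back as one, which is what lets us glue the header, the compressed body, and the trailer together:
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Vector;
public class SequenceDemo {
    public static void main(String[] args) throws Exception {
        // Three independent streams, read back to back as if they were one.
        Vector<InputStream> parts = new Vector<>();
        parts.add(new ByteArrayInputStream("header|".getBytes("UTF-8")));
        parts.add(new ByteArrayInputStream("content|".getBytes("UTF-8")));
        parts.add(new ByteArrayInputStream("trailer".getBytes("UTF-8")));
        try (InputStream combined = new SequenceInputStream(parts.elements())) {
            int b;
            while ((b = combined.read()) != -1) {
                System.out.print((char) b); // prints: header|content|trailer
            }
        }
    }
}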
Here is my implementation along with a simple unit test:
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Enumeration;
import java.util.zip.CRC32;
import java.util.zip.Deflater;
import java.util.zip.DeflaterInputStream;
import java.util.zip.DeflaterOutputStream;
/**
* @author mwyraz
* Wraps an input stream and compresses its contents. Similar to DeflaterInputStream but adds the GZIP header and trailer.
* See GZIPOutputStream for details.
* LICENSE: Free to use. Contains some lines from GZIPOutputStream, so Oracle's license might apply as well!
*/
public class GzipCompressingInputStream extends SequenceInputStream
{
public GzipCompressingInputStream(InputStream in) throws IOException
{
this(in,512);
}
public GzipCompressingInputStream(InputStream in, int bufferSize) throws IOException
{
super(new StatefullGzipStreamEnumerator(in,bufferSize));
}
static enum StreamState
{
HEADER,
CONTENT,
TRAILER
}
protected static class StatefullGzipStreamEnumerator implements Enumeration<InputStream>
{
protected final InputStream in;
protected final int bufferSize;
protected StreamState state;
public StatefullGzipStreamEnumerator(InputStream in, int bufferSize)
{
this.in=in;
this.bufferSize=bufferSize;
state=StreamState.HEADER;
}
public boolean hasMoreElements()
{
return state!=null;
}
public InputStream nextElement()
{
switch (state)
{
case HEADER:
state=StreamState.CONTENT;
return createHeaderStream();
case CONTENT:
state=StreamState.TRAILER;
return createContentStream();
case TRAILER:
state=null;
return createTrailerStream();
}
return null;
}
static final int GZIP_MAGIC = 0x8b1f;
static final byte[] GZIP_HEADER=new byte[] {
(byte) GZIP_MAGIC, // Magic number (short)
(byte)(GZIP_MAGIC >> 8), // Magic number (short)
Deflater.DEFLATED, // Compression method (CM)
0, // Flags (FLG)
0, // Modification time MTIME (int)
0, // Modification time MTIME (int)
0, // Modification time MTIME (int)
0, // Modification time MTIME (int)
0, // Extra flags (XFLG)
0 // Operating system (OS)
};
protected InputStream createHeaderStream()
{
return new ByteArrayInputStream(GZIP_HEADER);
}
protected InternalGzipCompressingInputStream contentStream;
protected InputStream createContentStream()
{
contentStream=new InternalGzipCompressingInputStream(new CRC32InputStream(in), bufferSize);
return contentStream;
}
protected InputStream createTrailerStream()
{
return new ByteArrayInputStream(contentStream.createTrailer());
}
}
/**
* Wraps the source stream and tracks the CRC32 and byte count of the uncompressed data
*/
protected static class CRC32InputStream extends FilterInputStream
{
protected CRC32 crc = new CRC32();
protected long byteCount;
public CRC32InputStream(InputStream in)
{
super(in);
}
@Override
public int read() throws IOException
{
int val=super.read();
if (val>=0)
{
crc.update(val);
byteCount++;
}
return val;
}
@Override
public int read(byte[] b, int off, int len) throws IOException
{
len=super.read(b, off, len);
if (len>=0)
{
crc.update(b,off,len);
byteCount+=len;
}
return len;
}
public long getCrcValue()
{
return crc.getValue();
}
public long getByteCount()
{
return byteCount;
}
}
/**
* Internal stream without header/trailer
*/
protected static class InternalGzipCompressingInputStream extends DeflaterInputStream
{
protected final CRC32InputStream crcIn;
public InternalGzipCompressingInputStream(CRC32InputStream in, int bufferSize)
{
super(in, new Deflater(Deflater.DEFAULT_COMPRESSION, true),bufferSize);
crcIn=in;
}
public void close() throws IOException
{
if (in != null)
{
try
{
def.end();
in.close();
}
finally
{
in = null;
}
}
}
protected final static int TRAILER_SIZE = 8;
public byte[] createTrailer()
{
byte[] trailer= new byte[TRAILER_SIZE];
writeTrailer(trailer, 0);
return trailer;
}
/*
* Writes GZIP member trailer to a byte array, starting at a given
* offset.
*/
private void writeTrailer(byte[] buf, int offset)
{
writeInt((int)crcIn.getCrcValue(), buf, offset); // CRC-32 of uncompr. data
writeInt((int)crcIn.getByteCount(), buf, offset + 4); // Number of uncompr. bytes
}
/*
* Writes integer in Intel byte order to a byte array, starting at a
* given offset.
*/
private void writeInt(int i, byte[] buf, int offset)
{
writeShort(i & 0xffff, buf, offset);
writeShort((i >> 16) & 0xffff, buf, offset + 2);
}
/*
* Writes short integer in Intel byte order to a byte array, starting
* at a given offset
*/
private void writeShort(int s, byte[] buf, int offset)
{
buf[offset] = (byte)(s & 0xff);
buf[offset + 1] = (byte)((s >> 8) & 0xff);
}
}
}
import static org.junit.Assert.*;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.zip.CRC32;
import java.util.zip.GZIPInputStream;
import org.junit.Test;
public class TestGzipCompressingInputStream
{
@Test
public void test() throws Exception
{
testCompressor("test1 test2 test3");
testCompressor("1MB binary data",createTestPattern(1024*1024));
for (int i=0;i<4096;i++)
{
testCompressor(i+" bytes of binary data",createTestPattern(i));
}
}
protected byte[] createTestPattern(int size)
{
byte[] data=new byte[size];
byte pattern=0;
for (int i=0;i<size;i++)
{
data[i]=pattern++;
}
return data;
}
protected void testCompressor(String data) throws IOException
{
testCompressor("String: "+data,data.getBytes());
}
protected void testCompressor(String dataInfo, byte[] data) throws IOException
{
InputStream uncompressedIn=new ByteArrayInputStream(data);
InputStream compressedIn=new GzipCompressingInputStream(uncompressedIn);
InputStream uncompressedOut=new GZIPInputStream(compressedIn);
byte[] result=StreamHelper.readBinaryStream(uncompressedOut);
assertTrue("Test failed for: "+dataInfo,Arrays.equals(data,result));
}
}
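For completeness, a minimal usage sketch (my addition, not part of the original answer; the file names are placeholders). It streams a file into a .gz file without ever holding the whole content in memory:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
public class GzipCompressingInputStreamDemo {
    public static void main(String[] args) throws Exception {
        // The uncompressed source is compressed on the fly as it is read.
        try (InputStream compressed = new GzipCompressingInputStream(new FileInputStream("big-input.dat"));
             OutputStream out = new FileOutputStream("big-input.dat.gz")) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = compressed.read(buf)) != -1) {
                out.write(buf, 0, n);
            }
        }
    }
}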
If you don't want to load the content into a big byte array and need a true streaming solution:
package x.y.z;
import org.apache.commons.io.IOUtils;
import java.io.*;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipOutputStream;
/**
* Stream Compression Utility
*
* @author Thamme Gowda N
*/
public enum CompressionUtil {
INSTANCE;
public static final int NUM_THREADS = 5;
private final ExecutorService pool;
CompressionUtil(){
this.pool = Executors.newFixedThreadPool(NUM_THREADS);
}
public static CompressionUtil getInstance(){
return INSTANCE;
}
/**
* Supported compression type names
*/
public static enum CompressionType {
GZIP,
Zip
}
/**
* Wraps the given stream in a Compressor stream based on given type
* @param sourceStream : Stream to be wrapped
* @param type : Compression type
* @return source stream wrapped in a compressor stream
* @throws IOException when something bad happens
*/
public static OutputStream getCompressionWrapper(OutputStream sourceStream,
CompressionType type) throws IOException {
switch (type) {
case GZIP:
return new GZIPOutputStream(sourceStream);
case Zip:
return new ZipOutputStream(sourceStream);
default:
throw new IllegalArgumentException("Possible values :"
+ Arrays.toString(CompressionType.values()));
}
}
/**
* Gets Compressed Stream for given input Stream
* @param sourceStream : Input Stream to be compressed
* @param type: Compression type such as GZIP
* @return Compressed Stream
* @throws IOException when something bad happens
*/
public static InputStream getCompressedStream(final InputStream sourceStream,
CompressionType type ) throws IOException {
if(sourceStream == null) {
throw new IllegalArgumentException("Source Stream cannot be NULL");
}
/**
* sourceStream --> zipperOutStream(->intermediateStream -)--> resultStream
*/
final PipedInputStream resultStream = new PipedInputStream();
final PipedOutputStream intermediateStream = new PipedOutputStream(resultStream);
final OutputStream zipperOutStream = getCompressionWrapper(intermediateStream, type);
Runnable copyTask = new Runnable() {
@Override
public void run() {
try {
int c;
while((c = sourceStream.read()) >= 0) {
zipperOutStream.write(c);
}
zipperOutStream.flush();
} catch (IOException e) {
IOUtils.closeQuietly(resultStream); // close it on error case only
throw new RuntimeException(e);
} finally {
// close source stream and intermediate streams
IOUtils.closeQuietly(sourceStream);
IOUtils.closeQuietly(zipperOutStream);
IOUtils.closeQuietly(intermediateStream);
}
}
};
getInstance().pool.submit(copyTask);
return resultStream;
}
public static void main(String[] args) throws IOException {
String input = "abcdefghij";
InputStream sourceStream = new ByteArrayInputStream(input.getBytes());
InputStream compressedStream =
getCompressedStream(sourceStream, CompressionType.GZIP);
GZIPInputStream decompressedStream = new GZIPInputStream(compressedStream);
List<String> lines = IOUtils.readLines(decompressedStream);
String output = lines.get(0);
System.out.println("test passed ? " + input.equals(output));
}
}
It looks like I'm three years late, but maybe this will be useful to someone. My solution is similar to @MichaelWyraz's; the only difference is that mine is based on FilterInputStream.
import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.CRC32;
import java.util.zip.Deflater;
public class GZipInputStreamDeflater extends FilterInputStream {
private static enum Stage {
HEADER,
DATA,
FINALIZATION,
TRAILER,
FINISH
}
private GZipInputStreamDeflater.Stage stage = Stage.HEADER;
private final Deflater deflater = new Deflater( Deflater.DEFLATED, true );
private final CRC32 crc = new CRC32();
/* GZIP header magic number */
private final static int GZIP_MAGIC = 0x8b1f;
private ByteArrayInputStream trailer = null;
private ByteArrayInputStream header = new ByteArrayInputStream( new byte[] {
(byte) GZIP_MAGIC, // Magic number (short)
(byte) ( GZIP_MAGIC >> 8 ), // Magic number (short)
Deflater.DEFLATED, // Compression method (CM)
0, // Flags (FLG)
0, // Modification time MTIME (int)
0, // Modification time MTIME (int)
0, // Modification time MTIME (int)
0, // Modification time MTIME (int)
0, // Extra flags (XFLG)
0, // Operating system (OS)
} );
public GZipInputStreamDeflater(InputStream in) {
super( in );
crc.reset();
}
@Override
public int read( byte[] b, int off, int len ) throws IOException {
int read = -1;
switch( stage ) {
case FINISH:
return -1;
case HEADER:
read = header.read( b, off, len );
if( header.available() == 0 ) {
stage = Stage.DATA;
}
return read;
case DATA:
byte[] b2 = new byte[len];
read = super.read( b2, 0, len );
if( read <= 0 ) {
stage = Stage.FINALIZATION;
deflater.finish();
return 0;
}
else {
deflater.setInput( b2, 0, read );
crc.update( b2, 0, read );
read = 0;
while( !deflater.needsInput() && len - read > 0 ) {
read += deflater.deflate( b, off + read, len - read, Deflater.NO_FLUSH );
}
return read;
}
case FINALIZATION:
if( deflater.finished() ) {
stage = Stage.TRAILER;
int crcValue = (int) crc.getValue();
int totalIn = deflater.getTotalIn();
trailer = new ByteArrayInputStream( new byte[] {
(byte) ( crcValue >> 0 ),
(byte) ( crcValue >> 8 ),
(byte) ( crcValue >> 16 ),
(byte) ( crcValue >> 24 ),
(byte) ( totalIn >> 0 ),
(byte) ( totalIn >> 8 ),
(byte) ( totalIn >> 16 ),
(byte) ( totalIn >> 24 ),
} );
return 0;
}
else {
read = deflater.deflate( b, off, len, Deflater.FULL_FLUSH );
return read;
}
case TRAILER:
read = trailer.read( b, off, len );
if( trailer.available() == 0 ) {
stage = Stage.FINISH;
}
return read;
}
return -1;
}
@Override
public void close( ) throws IOException {
super.close();
deflater.end();
if( trailer != null ) {
trailer.close();
}
header.close();
}
}
Usage:
AmazonS3Client s3client = new AmazonS3Client( ... );
try ( InputStream in = new GZipInputStreamDeflater( new URL( "http://....../very-big-file.csv" ).openStream() ); ) {
PutObjectRequest putRequest = new PutObjectRequest( "BUCKET-NAME", "/object/key", in, new ObjectMetadata() );
s3client.putObject( putRequest );
}
To compress data you need GZIPOutputStream. But since you need to read the data back as from an InputStream, you have to turn the OutputStream into an InputStream. The simplest way is to write into a ByteArrayOutputStream and wrap its bytes:
ByteArrayOutputStream bos = new ByteArrayOutputStream();
GZIPOutputStream gout = new GZIPOutputStream(bos);
//... Code to read from your original uncompressed data and write to gout.
gout.close();
//Convert to InputStream.
InputStream compressed = new ByteArrayInputStream(bos.toByteArray());
However, this approach has the limitation that all the data must be read in first, which means you need enough memory to hold that buffer.
An alternative approach using pipes is discussed in this thread: How do I convert an OutputStream into an InputStream?
A working example of compressing an input stream can be found in the popular open-source ESB Mule: GZIPCompressorInputStream.
It uses the DeflaterInputStream provided by the JRE for compression, prepends the gzip header, and appends the gzip trailer (a.k.a. footer).
Unfortunately, it is under the CPA license, which doesn't seem to be very common. It also appears to have no unit tests.
There is no DeflatingGZIPInputStream in the JRE. To deflate in the "deflate" compression format, use java.util.zip.DeflaterInputStream or java.util.zip.DeflaterOutputStream:
public InputStream getCompressedStream(InputStream unCompressedStream) {
return new DeflaterInputStream(unCompressedStream);
}
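As a quick sanity check (my sketch, not part of this answer), the stream produced above can be read back with InflaterInputStream; GZIPInputStream would reject it because there is no gzip header:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.zip.DeflaterInputStream;
import java.util.zip.InflaterInputStream;
public class DeflateRoundTrip {
    public static void main(String[] args) throws Exception {
        byte[] original = "hello hello hello".getBytes("UTF-8");
        // DeflaterInputStream with the default Deflater emits zlib-wrapped deflate data (no gzip header/trailer).
        InputStream deflated = new DeflaterInputStream(new ByteArrayInputStream(original));
        // InflaterInputStream with the default Inflater reads that format back.
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        try (InputStream inflated = new InflaterInputStream(deflated)) {
            byte[] buf = new byte[1024];
            int n;
            while ((n = inflated.read(buf)) != -1) {
                restored.write(buf, 0, n);
            }
        }
        System.out.println(new String(restored.toByteArray(), "UTF-8"));
    }
}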
You could derive a class from java.util.zip.DeflaterInputStream that deflates in GZIP format by examining the source of java.util.zip.GZIPOutputStream.
Shouldn't you be looking at GZIPOutputStream then?
public InputStream getCompressedStream(InputStream input) throws IOException {
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
OutputStream output = new GZIPOutputStream(bytes);
IOUtils.copy(input, output);
output.close();
return new ByteArrayInputStream(bytes.toByteArray());
}
With PipedOutputStream you can write to a GZIPOutputStream and expose that data through an InputStream. Unlike the other solutions, which buffer the whole stream in an array or a file, it has a fixed memory cost. The only catch is that you cannot read and write from the same thread; you have to use a separate one.
private InputStream gzipInputStream(InputStream in) throws IOException {
PipedInputStream zipped = new PipedInputStream();
PipedOutputStream pipe = new PipedOutputStream(zipped);
new Thread(
() -> {
try(OutputStream zipper = new GZIPOutputStream(pipe)){
IOUtils.copy(in, zipper);
} catch (IOException e) {
e.printStackTrace();
}
}
).start();
return zipped;
}
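A usage sketch (my addition; the file names are placeholders): compress a file on the fly and copy the gzipped bytes somewhere else, e.g. to another file.
try (InputStream source = new FileInputStream("input.txt");
     InputStream gzipped = gzipInputStream(source);
     OutputStream target = new FileOutputStream("output.gz")) {
    IOUtils.copy(gzipped, target);
}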
You can use EasyStream:
try(final InputStreamFromOutputStream<Void> isOs = new InputStreamFromOutputStream<Void>() {
@Override
protected void produce(final OutputStream dataSink) throws Exception {
// Compress (not decompress): wrap the sink in a GZIPOutputStream and copy the uncompressed source into it.
try (OutputStream gzipSink = new GZIPOutputStream(dataSink)) {
IOUtils.copy(unCompressedStream, gzipSink);
}
}
}) {
//You can use the compressed input stream here
} catch (final IOException e) {
//Handle exceptions here
}
Here is a version I wrote that doesn't contain the CRC/GZIP magic cookies itself, since it delegates those to GZIPOutputStream. It is also memory-efficient in that it only uses enough memory to buffer the compression (a 42 MB file used a 45k buffer). Performance is the same as compressing into memory.
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;
/**
* Compresses an InputStream in a memory-optimal, on-demand way only compressing enough to fill a buffer.
*
* @author Ben La Monica
*/
public class GZIPCompressingInputStream extends InputStream {
private InputStream in;
private GZIPOutputStream gz;
private OutputStream delegate;
private byte[] buf = new byte[8192];
private byte[] readBuf = new byte[8192];
int read = 0;
int write = 0;
public GZIPCompressingInputStream(InputStream in) throws IOException {
this.in = in;
this.delegate = new OutputStream() {
private void growBufferIfNeeded(int len) {
if ((write + len) >= buf.length) {
// grow the array if we don't have enough space to fulfill the incoming data
byte[] newbuf = new byte[(buf.length + len) * 2];
System.arraycopy(buf, 0, newbuf, 0, buf.length);
buf = newbuf;
}
}
@Override
public void write(byte[] b, int off, int len) throws IOException {
growBufferIfNeeded(len);
System.arraycopy(b, off, buf, write, len);
write += len;
}
@Override
public void write(int b) throws IOException {
growBufferIfNeeded(1);
buf[write++] = (byte) b;
}
};
this.gz = new GZIPOutputStream(delegate);
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
compressStream();
int numBytes = Math.min(len, write-read);
if (numBytes > 0) {
System.arraycopy(buf, read, b, off, numBytes);
read += numBytes;
} else if (len > 0) {
// if bytes were requested, but we have none, then we're at the end of the stream
return -1;
}
return numBytes;
}
private void compressStream() throws IOException {
// if the reader has caught up with the writer, then zero the positions out
if (read == write) {
read = 0;
write = 0;
}
while (write == 0) {
// feed the gzip stream data until it spits out a block
int val = in.read(readBuf);
if (val == -1) {
// nothing left to do, we've hit the end of the stream. finalize and break out
gz.close();
break;
} else if (val > 0) {
gz.write(readBuf, 0, val);
}
}
}
@Override
public int read() throws IOException {
compressStream();
if (write == 0) {
// write should not be 0 if we were able to get data from compress stream, must mean we're at the end
return -1;
} else {
// reading a single byte
return buf[read++] & 0xFF;
}
}
}
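A quick round-trip check (my addition, not part of the original answer): compress an in-memory byte array with the class above and inflate it again with GZIPInputStream.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
public class GZIPCompressingInputStreamRoundTrip {
    public static void main(String[] args) throws Exception {
        byte[] data = new byte[1_000_000];
        new java.util.Random(42).nextBytes(data);
        // Compress on the fly while reading, then decompress again.
        InputStream compressed = new GZIPCompressingInputStream(new ByteArrayInputStream(data));
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        try (InputStream inflated = new GZIPInputStream(compressed)) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = inflated.read(buf)) != -1) {
                restored.write(buf, 0, n);
            }
        }
        System.out.println("round trip ok? " + Arrays.equals(data, restored.toByteArray()));
    }
}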
public InputStream getCompressed( InputStream is ) throws IOException
{
byte data[] = new byte[2048];
ByteArrayOutputStream bos = new ByteArrayOutputStream();
GZIPOutputStream zos = new GZIPOutputStream( bos );
BufferedInputStream entryStream = new BufferedInputStream( is, 2048);
int count;
while ( ( count = entryStream.read( data, 0, 2048) ) != -1 )
{
zos.write( data, 0, count );
}
entryStream.close();
zos.close();
return new ByteArrayInputStream( bos.toByteArray() );
}
ref: Zip compression