
Writing Delta Lake to AWS S3 (without Databricks)

# Create the PySpark session and point the s3n connector at our AWS credentials
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("XMLParser").getOrCreate()
sc = spark.sparkContext
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", aws_key)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", aws_secret)

I can then read a file from the s3 bucket with the following code:

df = spark.read.format("xml").options(rootTag='returnResult', rowTag="query").load("s3n://bucketName/folder/file.xml")
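
Note that neither of these formats ships with Spark itself: format("xml") comes from the spark-xml package and format("delta") from delta-core. As a minimal sketch of pulling both in when the session is created (the version numbers here are illustrative assumptions for a Spark 2.4 / Scala 2.11 build and must match your cluster):

from pyspark.sql import SparkSession

# Illustrative package coordinates; pick versions matching your Spark/Scala build
spark = SparkSession.builder \
    .appName("XMLParser") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.11:0.6.0,io.delta:delta-core_2.11:0.4.0") \
    .getOrCreate()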

But when I try to write back to s3 as a Delta Lake table (parquet files) with this code:

df.write.format("delta").mode('overwrite').save("s3n://bucket/folder/file")

I get this error:

    Py4JJavaError: An error occurred while calling o778.save.
: java.io.IOException: The error typically occurs when the default LogStore implementation, that
 is, HDFSLogStore, is used to write into a Delta table on a non-HDFS storage system.
 In order to get the transactional ACID guarantees on table updates, you have to use the
 correct implementation of LogStore that is appropriate for your storage system.
 See https://docs.delta.io/latest/delta-storage.html for details.

    at org.apache.spark.sql.delta.DeltaErrors$.incorrectLogStoreImplementationException(DeltaErrors.scala:157)
    at org.apache.spark.sql.delta.storage.HDFSLogStore.writeInternal(HDFSLogStore.scala:73)
    at org.apache.spark.sql.delta.storage.HDFSLogStore.write(HDFSLogStore.scala:64)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$$anonfun$doCommit$1.apply$mcJ$sp(OptimisticTransaction.scala:434)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$$anonfun$doCommit$1.apply(OptimisticTransaction.scala:416)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$$anonfun$doCommit$1.apply(OptimisticTransaction.scala:416)
    at org.apache.spark.sql.delta.DeltaLog.lockInterruptibly(DeltaLog.scala:152)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$class.doCommit(OptimisticTransaction.scala:415)
    at org.apache.spark.sql.delta.OptimisticTransaction.doCommit(OptimisticTransaction.scala:80)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$$anonfun$commit$1.apply$mcJ$sp(OptimisticTransaction.scala:326)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$$anonfun$commit$1.apply(OptimisticTransaction.scala:284)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$$anonfun$commit$1.apply(OptimisticTransaction.scala:284)
    at com.databricks.spark.util.DatabricksLogging$class.recordOperation(DatabricksLogging.scala:77)
    at org.apache.spark.sql.delta.OptimisticTransaction.recordOperation(OptimisticTransaction.scala:80)
    at org.apache.spark.sql.delta.metering.DeltaLogging$class.recordDeltaOperation(DeltaLogging.scala:103)
    at org.apache.spark.sql.delta.OptimisticTransaction.recordDeltaOperation(OptimisticTransaction.scala:80)
    at org.apache.spark.sql.delta.OptimisticTransactionImpl$class.commit(OptimisticTransaction.scala:284)
    at org.apache.spark.sql.delta.OptimisticTransaction.commit(OptimisticTransaction.scala:80)
    at org.apache.spark.sql.delta.commands.WriteIntoDelta$$anonfun$run$1.apply(WriteIntoDelta.scala:67)
    at org.apache.spark.sql.delta.commands.WriteIntoDelta$$anonfun$run$1.apply(WriteIntoDelta.scala:64)
    at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:188)
    at org.apache.spark.sql.delta.commands.WriteIntoDelta.run(WriteIntoDelta.scala:64)
    at org.apache.spark.sql.delta.sources.DeltaDataSource.createRelation(DeltaDataSource.scala:134)
    at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
    at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83)
    at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81)
    at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
    at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
    at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
    at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.fs.UnsupportedFileSystemException: fs.AbstractFileSystem.s3n.impl=null: No AbstractFileSystem configured for scheme: s3n
    at org.apache.hadoop.fs.AbstractFileSystem.createFileSystem(AbstractFileSystem.java:160)
    at org.apache.hadoop.fs.AbstractFileSystem.get(AbstractFileSystem.java:249)
    at org.apache.hadoop.fs.FileContext$2.run(FileContext.java:334)
    at org.apache.hadoop.fs.FileContext$2.run(FileContext.java:331)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
    at org.apache.hadoop.fs.FileContext.getAbstractFileSystem(FileContext.java:331)
    at org.apache.hadoop.fs.FileContext.getFileContext(FileContext.java:448)
    at org.apache.spark.sql.delta.storage.HDFSLogStore.getFileContext(HDFSLogStore.scala:47)
    at org.apache.spark.sql.delta.storage.HDFSLogStore.writeInternal(HDFSLogStore.scala:70)
    ... 53 more

I tried following the link given in the stack trace, but I could not work out how to resolve this. Any help would be appreciated.

Asked by Amit

After creating the spark session, you need to add the configuration provided by Databricks to enable S3 as a Delta store, as follows:

# Use the S3-compatible LogStore implementation instead of the default HDFSLogStore
conf = spark.sparkContext._conf.setAll([('spark.delta.logStore.class', 'org.apache.spark.sql.delta.storage.S3SingleDriverLogStore')])
spark.sparkContext._conf.getAll()  # confirm the property is present

As the name suggests, the S3SingleDriverLogStore implementation works correctly only when all concurrent writes originate from a single Spark driver. It is an application property, so it must be set before the SparkContext is started and cannot be changed for the lifetime of the context.
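
Because the property cannot be changed on a running context, a safer pattern is to set it on the builder before the first getOrCreate() call. A minimal sketch, reusing the appName from the question:

from pyspark.sql import SparkSession

# The LogStore class must be configured before the SparkContext starts
spark = SparkSession.builder \
    .appName("XMLParser") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .getOrCreate()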

Refer to the Databricks documentation here to configure the s3a path, access key, and secret key.
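
As a sketch of that s3a setup, mirroring the s3n block from the question (fs.s3a.access.key and fs.s3a.secret.key are the standard hadoop-aws configuration keys; the bucket path is a placeholder, and a hadoop-aws JAR matching your Hadoop version must be on the classpath):

# Configure the s3a connector in place of the deprecated s3n scheme
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.access.key", aws_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret)

# Write the Delta table through an s3a:// path
df.write.format("delta").mode("overwrite").save("s3a://bucket/folder/file")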

Answered by Shubham Jain