I am working on a deep-learning project where we've to read chest_xray images into a dataframe. SO, our initial plan is to use PySpark but due to many errors, we thought of using Spark, only to read images and then use Keras to fit the model.
This is possible only if we can convert spark dataframe into a pandas dataframe. (If you've any other solution, please feel free to englighten me on this topic)
Btw, we're using hadoop to read in the images using following code into jupyter-notebook:
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
URI = sc._gateway.jvm.java.net.URI
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
fs = FileSystem.get(URI("hdfs://localhost:9000"), sc._jsc.hadoopConfiguration())
for f in fs.listStatus(Path("/data/input/chest_xray")):
    print (f.getPath())
hdfs://localhost:9000/data/input/chest_xray/.DS_Store
hdfs://localhost:9000/data/input/chest_xray/train
hdfs://localhost:9000/data/input/chest_xray/val
df = spark.read.format("image").load("hdfs://localhost:9000/data/input/chest_xray/val/NORMAL")
df.show()
+--------------------+
|               image|
+--------------------+
|[hdfs://localhost...|
|[hdfs://localhost...|
|[hdfs://localhost...|
|[hdfs://localhost...|
|[hdfs://localhost...|
|[hdfs://localhost...|
|[hdfs://localhost...|
|[hdfs://localhost...|
+--------------------+
I've googled that to convert Spark Dataframe into a Pandas DataFrame, we need use spark_df.toPandas() . However, it is throwing me the following error:
Py4JJavaError                             Traceback (most recent call last)
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
Py4JJavaError: An error occurred while calling o53.collectToPython.
: java.lang.IllegalArgumentException: Unsupported class file major version 55
    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:166)
    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:148)
    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:136)
    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:237)
    at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:49)
    at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:517)
    at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:500)
    at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
    at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
    at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
    at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:236)
    at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
    at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:134)
    at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
    at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:500)
    at org.apache.xbean.asm6.ClassReader.readCode(ClassReader.java:2175)
    at org.apache.xbean.asm6.ClassReader.readMethod(ClassReader.java:1238)
    at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:631)
    at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:355)
    at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:307)
    at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:306)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:306)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2100)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
    at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
    at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
    at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
    at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.base/java.lang.Thread.run(Thread.java:834)
During handling of the above exception, another exception occurred:
IllegalArgumentException                  Traceback (most recent call last)
<ipython-input-13-af2e220971fb> in <module>
----> 1 df_res = df.toPandas()
/usr/local/spark/python/pyspark/sql/dataframe.py in toPandas(self)
   2141 
   2142         # Below is toPandas without Arrow optimization.
-> 2143         pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
   2144 
   2145         dtype = {}
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
    532         """
    533         with SCCallSiteSync(self._sc) as css:
--> 534             sock_info = self._jdf.collectToPython()
    535         return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
    536 
/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     77                 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
     78             if s.startswith('java.lang.IllegalArgumentException: '):
---> 79                 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
     80             raise
     81     return deco
IllegalArgumentException: 'Unsupported class file major version 55'
I don't understand what is Unsupported class file major version 55. Care to throw some light? Please ask if you need any other specs.
 
    