I need to iterate over a data frame in a specific order and apply some complex logic to calculate a new column.
My strong preference is also to do it in a generic way, so I do not have to list all the columns of a row and do df.as[my_record] or case Row(...) => as shown here. Instead, I want to access row columns by their names and simply add the result column(s) to the source row.
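For context, the typed alternative I am trying to avoid would look roughly like the sketch below (MyRecord and the hard-coded mapping are purely illustrative, and it assumes spark.implicits._ is in scope):

// every column has to be listed in the case class and repeated in the mapping
case class MyRecord(part: Int, id: Int)

val typed = df.as[MyRecord]
val out = typed.map(r => (r.part, r.id, if (r.id == 3) None else Some(r.id * 10)))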
The approach below works just fine, but I'd like to avoid specifying the schema twice: first so that I can access columns by name while iterating, and a second time to process the output.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
val q = """
select 2 part, 1 id
union all select 2 part, 4 id
union all select 2 part, 3 id
union all select 2 part, 2 id
"""
val df = spark.sql(q)
def f_row(iter: Iterator[Row]): Iterator[Row] = {
  if (iter.hasNext) {
    // placeholder for the real calculation; returns Integer so it can produce null
    def complex_logic(p: Int): Integer = if (p == 3) null else p * 10
    // peek at the first row to derive the output schema (input schema + result column)
    val head = iter.next
    val schema = StructType(head.schema.fields :+ StructField("result", IntegerType))
    val r =
      new GenericRowWithSchema((head.toSeq :+ complex_logic(head.getAs("id"))).toArray, schema)
    // scanLeft re-emits the already-processed head row, then transforms each remaining row
    iter.scanLeft(r)((r1, r2) =>
      new GenericRowWithSchema((r2.toSeq :+ complex_logic(r2.getAs("id"))).toArray, schema)
    )
  } else iter
}
val schema = StructType(df.schema.fields :+ StructField("result", IntegerType))
val encoder = RowEncoder(schema)
df.repartition($"part").sortWithinPartitions($"id").mapPartitions(f_row)(encoder).show
What information is lost after applying mapPartitions so that the output cannot be processed without an explicit encoder? How can I avoid specifying it?