I want to create a range of dates in a Spark DataFrame; there is no built-in function to do this, so I wrote the following:
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = SparkSession.builder.appName('test').getOrCreate()

# ids 1 through 9, each paired with a constant start date
data_frame = spark.range(1, 10).withColumn('date_start', F.to_date(F.lit('2018-01-01'), 'yyyy-MM-dd'))
The result is
+---+----------+
| id|date_start|
+---+----------+
|  1|2018-01-01|
|  2|2018-01-01|
|  3|2018-01-01|
|  4|2018-01-01|
|  5|2018-01-01|
+---+----------+
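In case the column types matter, here is a quick sanity check of the schema (nothing surprising, but it confirms that 'id' is a long column rather than a Python integer):
# 'id' is a bigint (long) column, 'date_start' is a date column
data_frame.printSchema()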
Now I want to add the 'id' column to the 'date_start' column to create a column of dates ranging from start to end.
data_frame.withColumn('date_window', F.date_add(F.col('date_start'), F.col('id')))
But I got this TypeError:
TypeError                                 Traceback (most recent call last)
<ipython-input-151-9e46a2ad88a2> in <module>
----> 1 data_frame.withColumn('date_window', F.date_add(F.col('date_start'), F.col('id')))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\sql\functions.py in date_add(start, days)
   1039     """
   1040     sc = SparkContext._active_spark_context
-> 1041     return Column(sc._jvm.functions.date_add(_to_java_column(start), days))
   1042 
   1043 
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
   1246 
   1247     def __call__(self, *args):
-> 1248         args_command, temp_args = self._build_args(*args)
   1249 
   1250         command = proto.CALL_COMMAND_NAME +\
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in _build_args(self, *args)
   1210     def _build_args(self, *args):
   1211         if self.converters is not None and len(self.converters) > 0:
-> 1212             (new_args, temp_args) = self._get_args(args)
   1213         else:
   1214             new_args = args
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in _get_args(self, args)
   1197                 for converter in self.gateway_client.converters:
   1198                     if converter.can_convert(arg):
-> 1199                         temp_arg = converter.convert(arg, self.gateway_client)
   1200                         temp_args.append(temp_arg)
   1201                         new_args.append(temp_arg)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_collections.py in convert(self, object, gateway_client)
    498         ArrayList = JavaClass("java.util.ArrayList", gateway_client)
    499         java_list = ArrayList()
--> 500         for element in object:
    501             java_list.add(element)
    502         return java_list
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\sql\column.py in __iter__(self)
    342 
    343     def __iter__(self):
--> 344         raise TypeError("Column is not iterable")
    345 
    346     # string methods
TypeError: Column is not iterable
For some reason, I could solve this problem using the Spark function expr:
data_frame.withColumn("date_window", F.expr("date_add(date_start, id)"))
And voilà! It seems to work:
+---+----------+-----------+
| id|date_start|date_window|
+---+----------+-----------+
|  1|2018-01-01| 2018-01-02|
|  2|2018-01-01| 2018-01-03|
|  3|2018-01-01| 2018-01-04|
|  4|2018-01-01| 2018-01-05|
|  5|2018-01-01| 2018-01-06|
+---+----------+-----------+
My question is: how is the expr function different from the withColumn expression that I wrote?
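For reference, here is the full minimal script, combining the setup above with the expr workaround, in case it helps to reproduce:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('test').getOrCreate()

# integer ids paired with a constant start date
data_frame = spark.range(1, 10).withColumn(
    'date_start', F.to_date(F.lit('2018-01-01'), 'yyyy-MM-dd'))

# fails with "TypeError: Column is not iterable"
# data_frame.withColumn('date_window', F.date_add(F.col('date_start'), F.col('id')))

# works as expected
data_frame.withColumn('date_window', F.expr('date_add(date_start, id)')).show()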
 
    