coordinates = df0.registerTempTable('coordinates')  #5555 rows
metro_table = df1.registerTempTable('metro_table')  #272 rows
I'm trying to find the closest points to the metro station for that I implement this SQL query but I don't understand why Spark does not accept the INNER JOIN !!! And my second question is how to return the first row only !! I used fetch first 1 rows only but it gives me an indentation error
query = "SELECT uuid,\
            latitude,\
            longitude,\
            p.station_id,\
            p.xlat,\
            p.xlong,\
            p.type_train,\
            p.id_transport,\
              6371000* DEGREES(ACOS(COS(RADIANS(p.xlat))\
                     * COS(RADIANS(latitude))\
                     * COS(RADIANS(p.xlong) - RADIANS(longitude))\
                     + SIN(RADIANS(p.xlat))\
                     * SIN(RADIANS(latitude)))) AS distance_in_meters\
     FROM coordinates\
     CROSS JOIN (\
         SELECT id AS id_transport,\
                station_id,\
                xlat,\
                xlong,\
                type_train\
         FROM metro_table\
         fetch first 1 rows only\   # Doesn't work in Spark
        ) AS p ON 1=1\
     ORDER BY distance_in_meters"
# Run query
df = sqlContext.sql(query)   #1510960 rows
Using Pyspark (fetch first 1 rows only)
w = Window.partitionBy(['uuid', 'latitude', 'longitude']).orderBy('distance_in_meters')
df.select('uuid', 'latitude', 'longitude', xlat, xlong, F.min('distance_in_meters').over(w)).count()  #1510960 rows
Merto
 ---+----------+----------------+----------+----------+
|id |xlong     |xlat            |station_id|type_train|
+---+----------+----------------+----------+----------+
|1  |-73.668172|45.5552769999931|1         |métro     |
|2  |-73.668486|45.5542469999931|2         |métro     |
|3  |-73.668225|45.5556069999931|3         |métro     |
|4  |-73.667407|45.5561219999931|4         |métro     |
+---+----------+----------------+----------+----------+
Coordinates
+-----+---------+----------+
|uuid | latitude| longitude|
+-----+---------+----------+
|1009 | 45.53175| -73.62613|
|1009 | 45.53163| -73.62546|
+-----+---------+----------+
After CROSS JOIN
 +----+--------+---------+----------+----------------+----------+----------+-- ----------------+
 |uuid|latitude|longitude|station_id|            xlat|     xlong|type_train|distance_in_meters|
 +----+--------+---------+----------+----------------+----------+----------+------------------+
 |1009|45.53175|-73.62613|         2|45.5542469999931|-73.668486|     metro|237197.13838255248|
 |1009|45.53163|-73.62546|         2|45.5542469999931|-73.668486|     metro|240044.33000560844|
 |1009|45.53175|-73.62613|         1|45.5552769999931|-73.668172|     metro| 240121.5093484111|
 |1009|45.53175|-73.62613|         4|45.5561219999931|-73.667407|     metro|240897.59082511123|
 |1009|45.53175|-73.62613|         3|45.5556069999931|-73.668225|     metro|241622.85492502493|
 |1009|45.53163|-73.62546|         1|45.5552769999931|-73.668172|     metro|242937.79388593792|
 |1009|45.53163|-73.62546|         4|45.5561219999931|-73.667407|     metro| 243679.8807249287|
 |1009|45.53163|-73.62546|         3|45.5556069999931|-73.668225|     metro| 244431.2963545028|
 +----+--------+---------+----------+----------------+----------+----------+------------------+
Desirable results
+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id|            xlat|     xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53175|-73.62613|         2|45.5542469999931|-73.668486|     metro|237197.13838255248|
|1009|45.53163|-73.62546|         2|45.5542469999931|-73.668486|     metro|240044.33000560844|

