I'm new to PySpark. I was playing around with the tfidf. Just wanted to check if they are giving same results. But they are not same. Here's what I did.
# create the PySpark dataframe
sentenceData = sqlContext.createDataFrame((
  (0.0, "Hi I heard about Spark"),
  (0.0, "I wish Java could use case classes"),
  (1.0, "Logistic regression models are neat")
)).toDF("label", "sentence")
# tokenize
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
wordsData = tokenizer.transform(sentenceData)
# vectorize
vectorizer = CountVectorizer(inputCol='words', outputCol='vectorizer').fit(wordsData)
wordsData = vectorizer.transform(wordsData)
# calculate scores
idf = IDF(inputCol="vectorizer", outputCol="tfidf_features")
idf_model = idf.fit(wordsData)
wordsData = idf_model.transform(wordsData)
# dense the current response variable
def to_dense(in_vec):
    return DenseVector(in_vec.toArray())
to_dense_udf = udf(lambda x: to_dense(x), VectorUDT())
# create dense vector
wordsData = wordsData.withColumn("tfidf_features_dense", to_dense_udf('tfidf_features'))
I converted the PySpark df to pandas
wordsData_pandas = wordsData.toPandas()
and, then just calculating using sklearn's tfidf as following
def dummy_fun(doc):
    return doc
# create sklearn tfidf
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  
# transform and get idf scores
feature_matrix = tfidf.fit_transform(wordsData_pandas.words)
# create sklearn dtm matrix
sklearn_tfifdf = pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())
# create PySpark dtm matrix
spark_tfidf = pd.DataFrame([np.array(i) for i in wordsData_pandas.tfidf_features_dense], columns=vectorizer.vocabulary)
But unfortunately, I'm getting this for PySpark
<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>i</th>      <th>are</th>      <th>logistic</th>      <th>case</th>      <th>spark</th>      <th>hi</th>      <th>about</th>      <th>neat</th>      <th>could</th>      <th>regression</th>      <th>wish</th>      <th>use</th>      <th>heard</th>      <th>classes</th>      <th>java</th>      <th>models</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>0.287682</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.693147</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>    </tr>    <tr>      <th>1</th>      <td>0.287682</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.693147</td>      <td>0.000000</td>    </tr>    <tr>      <th>2</th>      <td>0.000000</td>      <td>0.693147</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.693147</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.693147</td>    </tr>  </tbody></table>
and this for sklearn,
<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>i</th>      <th>are</th>      <th>logistic</th>      <th>case</th>      <th>spark</th>      <th>hi</th>      <th>about</th>      <th>neat</th>      <th>could</th>      <th>regression</th>      <th>wish</th>      <th>use</th>      <th>heard</th>      <th>classes</th>      <th>java</th>      <th>models</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>0.355432</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.467351</td>      <td>0.467351</td>      <td>0.467351</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.467351</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>    </tr>    <tr>      <th>1</th>      <td>0.296520</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.389888</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.389888</td>      <td>0.000000</td>      <td>0.389888</td>      <td>0.389888</td>      <td>0.000000</td>      <td>0.389888</td>      <td>0.389888</td>      <td>0.000000</td>    </tr>    <tr>      <th>2</th>      <td>0.000000</td>      <td>0.447214</td>      <td>0.447214</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.447214</td>      <td>0.000000</td>      <td>0.447214</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.000000</td>      <td>0.447214</td>    </tr>  </tbody></table>
I did try out the use_idf, smooth_idf paramters. But none seem to make both same. What am I missing? Any help is appreciated. Thanks in advance.



