# Read the CSV using its first row as the header and let Spark infer the
# column types from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("data/Log_Reg_dataset.csv")

# Number of rows in the dataset.
df.count()

20000

# Number of columns in the dataset.
len(df.columns)

6

# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)

# Preview the first five rows.
df.show(5)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 5 rows

# Summary statistics (count, mean, stddev, min, quartiles, max) per column.
df.summary().show()

+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    25%|    null|               22|                0|    null|                4|                 0|
|    50%|    null|               27|                1|    null|                9|                 0|
|    75%|    null|               34|                1|    null|               14|                 1|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+

# Row count per Platform value.
df.groupBy('Platform').count().show()

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|    Bing| 4360|
|  Google| 5781|
+--------+-----+

# Row count per Country value.
df.groupBy('Country').count().show()

+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+

# Row count per Status value (the label later used for classification).
df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1|10000|
|     0|10000|
+------+-----+

# Mean of every numeric column within each Country.
df.groupBy('Country').mean().show()

+---------+------------------+-------------------+---------------------+--------------------+
|  Country|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|         avg(Status)|
+---------+------------------+-------------------+---------------------+--------------------+
| Malaysia|27.792282430213465| 0.5730706075533661|   11.192118226600986|  0.6568144499178982|
|    India|27.976854156296664| 0.5433051269288203|   10.727227476356397|  0.6212045793927327|
|Indonesia| 28.43159796354081| 0.5207751683363442|    9.985711939563148|  0.5422893742814913|
|   Brazil|30.274168600154677|  0.322892498066512|    4.921113689095128|0.038669760247486466|
+---------+------------------+-------------------+---------------------+--------------------+

# Mean of every numeric column within each Platform.
df.groupBy('Platform').mean().show()

+--------+------------------+-------------------+---------------------+------------------+
|Platform|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|       avg(Status)|
+--------+------------------+-------------------+---------------------+------------------+
|   Yahoo|28.569226087838523| 0.5094837204584644|    9.599655137437875|0.5071508266558474|
|    Bing| 28.68394495412844| 0.4720183486238532|    9.114908256880733|0.4559633027522936|
|  Google|28.380038055699707| 0.5149628092025601|    9.804878048780488|0.5210171250648676|
+--------+------------------+-------------------+---------------------+------------------+

# Mean of every numeric column within each Status value.
df.groupBy('Status').mean().show()

+------+--------+-------------------+---------------------+-----------+
|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|
+------+--------+-------------------+---------------------+-----------+
|     1| 26.5435|             0.7019|              14.5617|        1.0|
|     0| 30.5356|             0.3039|               4.5449|        0.0|
+------+--------+-------------------+---------------------+-----------+

from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Map each Platform label to a numeric index. In the recorded output the most
# frequent label (Yahoo) receives 0.0, the next (Google) 1.0, then Bing 2.0.
platform_indexer = StringIndexer(
    inputCol='Platform',
    outputCol='Platform_Num',
).fit(df)

# Append the Platform_Num column and preview the result.
df = platform_indexer.transform(df)
df.show(5)

+---------+---+--------------+--------+----------------+------+------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|
+---------+---+--------------+--------+----------------+------+------------+
|    India| 41|             1|   Yahoo|              21|     1|         0.0|
|   Brazil| 28|             1|   Yahoo|               5|     0|         0.0|
|   Brazil| 40|             0|  Google|               3|     0|         1.0|
|Indonesia| 31|             1|    Bing|              15|     1|         2.0|
| Malaysia| 32|             0|  Google|              15|     1|         1.0|
+---------+---+--------------+--------+----------------+------+------------+
only showing top 5 rows

# One-hot encode the platform index into a sparse vector. With three platforms
# the vector has two slots; the last index (Bing) is the all-zero vector.
platform_encoder = OneHotEncoder(inputCol='Platform_Num', outputCol='Platform_Vector')

# From Spark 3.0 OneHotEncoder is an Estimator and has no transform() until it
# is fitted; on Spark 2.x it is already a Transformer and has no fit().
if hasattr(platform_encoder, 'fit'):
    platform_encoder = platform_encoder.fit(df)

df = platform_encoder.transform(df)
df.show(5)

+---------+---+--------------+--------+----------------+------+------------+---------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|Platform_Vector|
+---------+---+--------------+--------+----------------+------+------------+---------------+
|    India| 41|             1|   Yahoo|              21|     1|         0.0|  (2,[0],[1.0])|
|   Brazil| 28|             1|   Yahoo|               5|     0|         0.0|  (2,[0],[1.0])|
|   Brazil| 40|             0|  Google|               3|     0|         1.0|  (2,[1],[1.0])|
|Indonesia| 31|             1|    Bing|              15|     1|         2.0|      (2,[],[])|
| Malaysia| 32|             0|  Google|              15|     1|         1.0|  (2,[1],[1.0])|
+---------+---+--------------+--------+----------------+------+------------+---------------+
only showing top 5 rows

# Platform frequencies, most common first.
df.groupBy('Platform').count().orderBy('count', ascending=False).show(5)

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|  Google| 5781|
|    Bing| 4360|
+--------+-----+

# Same frequencies keyed by the numeric index, to check the label-to-index mapping.
df.groupBy('Platform_Num').count().orderBy('count', ascending=False).show(5)

+------------+-----+
|Platform_Num|count|
+------------+-----+
|         0.0| 9859|
|         1.0| 5781|
|         2.0| 4360|
+------------+-----+

# Same frequencies keyed by the one-hot vector, to check the index-to-vector mapping.
df.groupBy('Platform_Vector').count().orderBy('count', ascending=False).show(5)

+---------------+-----+
|Platform_Vector|count|
+---------------+-----+
|  (2,[0],[1.0])| 9859|
|  (2,[1],[1.0])| 5781|
|      (2,[],[])| 4360|
+---------------+-----+

# Map each Country label to a numeric index and append it as Country_Num.
country_indexer = StringIndexer(inputCol='Country',
                                outputCol='Country_Num').fit(df)
df = country_indexer.transform(df)

# One-hot encode the country index. With four countries the vector has three
# slots; the last index (Malaysia) is the all-zero vector.
country_encoder = OneHotEncoder(inputCol='Country_Num',
                                outputCol='Country_Vector')

# From Spark 3.0 OneHotEncoder is an Estimator and has no transform() until it
# is fitted; on Spark 2.x it is already a Transformer and has no fit().
if hasattr(country_encoder, 'fit'):
    country_encoder = country_encoder.fit(df)
df = country_encoder.transform(df)

df.select(['Country', 'Country_Num', 'Country_Vector']).show(5)

+---------+-----------+--------------+
|  Country|Country_Num|Country_Vector|
+---------+-----------+--------------+
|    India|        1.0| (3,[1],[1.0])|
|   Brazil|        2.0| (3,[2],[1.0])|
|   Brazil|        2.0| (3,[2],[1.0])|
|Indonesia|        0.0| (3,[0],[1.0])|
| Malaysia|        3.0|     (3,[],[])|
+---------+-----------+--------------+
only showing top 5 rows

from pyspark.ml.feature import VectorAssembler

# Columns merged, in this order, into the single 'features' vector: the two
# one-hot vectors first, then the three numeric columns.
feature_columns = [
    'Platform_Vector',
    'Country_Vector',
    'Age',
    'Repeat_Visitor',
    'Web_pages_viewed',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df = df_assembler.transform(df)

df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- Platform_Num: double (nullable = false)
 |-- Platform_Vector: vector (nullable = true)
 |-- Country_Num: double (nullable = false)
 |-- Country_Vector: vector (nullable = true)
 |-- features: vector (nullable = true)

# Preview the assembled feature vector next to the label.
df.select(['features', 'Status']).show(5)

+--------------------+------+
|            features|Status|
+--------------------+------+
|[1.0,0.0,0.0,1.0,...|     1|
|[1.0,0.0,0.0,0.0,...|     0|
|(8,[1,4,5,7],[1.0...|     0|
|(8,[2,5,6,7],[1.0...|     1|
|(8,[1,5,7],[1.0,3...|     1|
+--------------------+------+
only showing top 5 rows

# Keep only the feature vector and the label for modelling.
model_df = df.select('features', 'Status')

from pyspark.ml.classification import LogisticRegression

# Hold out roughly 30% of the rows for testing. A fixed seed keeps the split,
# and every metric derived from it, repeatable across runs. The counts and
# scores recorded in this walkthrough came from an unseeded run, so a rerun
# will differ slightly from them.
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

# Number of rows in the training split.
train_df.count()

13842

# Number of rows in the test split.
test_df.count()

6158

# Logistic regression reading the assembled 'features' column (the estimator's
# default featuresCol, spelled out here) with Status as the label.
log_reg_model = LogisticRegression(featuresCol='features', labelCol='Status')

# Fit on the training split only.
log_reg_model_fit = log_reg_model.fit(train_df)

# Fitted coefficients, one per slot of the 8-element feature vector.
log_reg_model_fit.coefficients

DenseVector([0.1943, 0.2312, -0.4285, -0.1042, -3.7871, -0.0679, 1.7362, 0.7386])

# Fitted intercept term.
log_reg_model_fit.intercept

-5.19530354724406

# Score the training split and keep the per-row predictions.
train_result = log_reg_model_fit.evaluate(train_df).predictions

# True positives on the training split: rows labelled 1 that are also predicted 1.
is_true_positive = (train_result['Status'] == 1) & (train_result['prediction'] == 1)
correct_preds = train_result.filter(is_true_positive).count()

# Number of actual positives (Status == 1) in the training split.
train_df.filter(train_df['Status']==1).count()

6905

# Number of true positives on the training split.
correct_preds

6476

# Training-set recall: true positives divided by all actual positives.
correct_preds/train_df.filter(train_df['Status']==1).count()

0.9378711078928312

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out test split and keep the per-row predictions.
test_summary = log_reg_model_fit.evaluate(test_df)
results = test_summary.predictions

# Compare the true label with the predicted label for the first 20 rows.
results.select('Status', 'prediction').show(20)

+------+----------+
|Status|prediction|
+------+----------+
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     1|       0.0|
|     1|       0.0|
|     0|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
+------+----------+
only showing top 20 rows

# True positives on the test split: actual 1, predicted 1.
results.filter((results.Status == 1) & (results.prediction == 1)).count()

2906

# False negatives on the test split: actual 1, predicted 0.
results.filter((results.Status == 1) & (results.prediction == 0)).count()

189

# True negatives on the test split: actual 0, predicted 0.
results.filter((results.Status == 0) & (results.prediction == 0)).count()

2881

# False positives on the test split: actual 0, predicted 1.
results.filter((results.Status == 0) & (results.prediction == 1)).count()

182

Spark ML 05(Pyspark) (0)	2020.05.04
Spark ML 04(Pyspark) (0)	2020.05.03
Spark ML 02 (Pyspark) (0)	2020.04.26
Spark ML (Pyspark) (0)	2020.04.25
Spark Streaming (PySpark) (0)	2020.04.21

Hee'World

Hee'World

Spark ML 03 (Pyspark) 본문

Spark ML 03 (Pyspark)

'BigData > Spark' 카테고리의 다른 글

티스토리툴바

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30