4
4
"metadata" : {
5
5
"colab" : {
6
6
"provenance" : [],
7
- "authorship_tag" : " ABX9TyNxAkUJklJhGRAUU039mAKM " ,
7
+ "authorship_tag" : " ABX9TyO+U61SsUd5D9aYKQLGAMPF " ,
8
8
"include_colab_link" : true
9
9
},
10
10
"kernelspec" : {
353
353
]
354
354
}
355
355
]
356
+ },
357
+ {
358
+ "cell_type" : " markdown" ,
359
+ "source" : [
360
+ " # Feature Engineering"
361
+ ],
362
+ "metadata" : {
363
+ "id" : " Cb2BpKZUuy6Q"
364
+ }
365
+ },
366
+ {
367
+ "cell_type" : " code" ,
368
+ "source" : [
369
+ " from pyspark.sql.functions import dayofmonth, month, year, lag\n " ,
370
+ " from pyspark.sql.window import Window\n " ,
371
+ " \n " ,
372
+ " #Window\n " ,
373
+ " wind_spec = Window.orderBy('Order_Date')\n " ,
374
+ " \n " ,
375
+ " #Add Lag (sales from the previous row, i.e. the prior available Order_Date, which may not be the previous calendar day)\n " ,
376
+ " daily_sales = daily_sales.withColumn('Prev_Day_Sales', lag('Daily_Sales').over(wind_spec))\n " ,
377
+ " \n " ,
378
+ " daily_sales = daily_sales.withColumn('Day', dayofmonth(col('Order_Date')))\n " ,
379
+ " daily_sales = daily_sales.withColumn('Month', month(col('Order_Date')))\n " ,
380
+ " daily_sales = daily_sales.withColumn('Year', year(col('Order_Date')))\n " ,
381
+ " \n " ,
382
+ " #Drop NA values\n " ,
383
+ " daily_sales = daily_sales.na.drop()\n " ,
384
+ " \n " ,
385
+ " daily_sales.show(10)"
386
+ ],
387
+ "metadata" : {
388
+ "id" : " _TvbaBNVu50q" ,
389
+ "outputId" : " 6440540e-b667-46c3-9179-f185a70040e2" ,
390
+ "colab" : {
391
+ "base_uri" : " https://localhost:8080/"
392
+ }
393
+ },
394
+ "execution_count" : 30 ,
395
+ "outputs" : [
396
+ {
397
+ "output_type" : " stream" ,
398
+ "name" : " stdout" ,
399
+ "text" : [
400
+ " +----------+-----------------+-----------------+---+-----+----+\n " ,
401
+ " |Order_Date| Daily_Sales| Prev_Day_Sales|Day|Month|Year|\n " ,
402
+ " +----------+-----------------+-----------------+---+-----+----+\n " ,
403
+ " |2021-01-05| 19.536| 288.06| 5| 1|2021|\n " ,
404
+ " |2021-01-06| 4407.1| 19.536| 6| 1|2021|\n " ,
405
+ " |2021-01-07|87.15799999999999| 4407.1| 7| 1|2021|\n " ,
406
+ " |2021-01-09| 40.544|87.15799999999999| 9| 1|2021|\n " ,
407
+ " |2021-01-10| 54.83| 40.544| 10| 1|2021|\n " ,
408
+ " |2021-01-11| 9.94| 54.83| 11| 1|2021|\n " ,
409
+ " |2021-01-13| 3553.795| 9.94| 13| 1|2021|\n " ,
410
+ " |2021-01-14| 61.96| 3553.795| 14| 1|2021|\n " ,
411
+ " |2021-01-15| 149.95| 61.96| 15| 1|2021|\n " ,
412
+ " |2021-01-16| 299.964| 149.95| 16| 1|2021|\n " ,
413
+ " +----------+-----------------+-----------------+---+-----+----+\n " ,
414
+ " only showing top 10 rows\n " ,
415
+ " \n "
416
+ ]
417
+ }
418
+ ]
419
+ },
420
+ {
421
+ "cell_type" : " markdown" ,
422
+ "source" : [
423
+ " # Training and Testing"
424
+ ],
425
+ "metadata" : {
426
+ "id" : " gvQchto0yyZX"
427
+ }
428
+ },
429
+ {
430
+ "cell_type" : " code" ,
431
+ "source" : [
432
+ " from pyspark.ml.feature import VectorAssembler\n " ,
433
+ " \n " ,
434
+ " #Assemble the features into a single vector\n " ,
435
+ " feature_col = ['Day', 'Month', 'Year', 'Prev_Day_Sales']\n " ,
436
+ " assembler = VectorAssembler(inputCols = feature_col, outputCol = 'features')\n " ,
437
+ " \n " ,
438
+ " daily_sales_update = assembler.transform(daily_sales).select('features', 'Daily_Sales')\n " ,
439
+ " \n " ,
440
+ " daily_sales_update.show(10)"
441
+ ],
442
+ "metadata" : {
443
+ "id" : " wSLIMTLjy3pl" ,
444
+ "outputId" : " ec1d4068-8faf-4097-b49b-3979149cc836" ,
445
+ "colab" : {
446
+ "base_uri" : " https://localhost:8080/"
447
+ }
448
+ },
449
+ "execution_count" : 31 ,
450
+ "outputs" : [
451
+ {
452
+ "output_type" : " stream" ,
453
+ "name" : " stdout" ,
454
+ "text" : [
455
+ " +--------------------+-----------------+\n " ,
456
+ " | features| Daily_Sales|\n " ,
457
+ " +--------------------+-----------------+\n " ,
458
+ " |[5.0,1.0,2021.0,2...| 19.536|\n " ,
459
+ " |[6.0,1.0,2021.0,1...| 4407.1|\n " ,
460
+ " |[7.0,1.0,2021.0,4...|87.15799999999999|\n " ,
461
+ " |[9.0,1.0,2021.0,8...| 40.544|\n " ,
462
+ " |[10.0,1.0,2021.0,...| 54.83|\n " ,
463
+ " |[11.0,1.0,2021.0,...| 9.94|\n " ,
464
+ " |[13.0,1.0,2021.0,...| 3553.795|\n " ,
465
+ " |[14.0,1.0,2021.0,...| 61.96|\n " ,
466
+ " |[15.0,1.0,2021.0,...| 149.95|\n " ,
467
+ " |[16.0,1.0,2021.0,...| 299.964|\n " ,
468
+ " +--------------------+-----------------+\n " ,
469
+ " only showing top 10 rows\n " ,
470
+ " \n "
471
+ ]
472
+ }
473
+ ]
474
+ },
475
+ {
476
+ "cell_type" : " code" ,
477
+ "source" : [
478
+ " train, test = daily_sales_update.randomSplit([0.8, 0.2], seed= 42)"
479
+ ],
480
+ "metadata" : {
481
+ "id" : " cVHQJ5Er06Pl"
482
+ },
483
+ "execution_count" : 32 ,
484
+ "outputs" : []
356
485
}
357
486
]
358
487
}
0 commit comments