Skip to content

Commit ba71a29

Browse files
committed
Created using Colab
1 parent 9c08dd2 commit ba71a29

File tree

1 file changed

+130
-1
lines changed

1 file changed

+130
-1
lines changed

PySpark.ipynb

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"metadata": {
55
"colab": {
66
"provenance": [],
7-
"authorship_tag": "ABX9TyNxAkUJklJhGRAUU039mAKM",
7+
"authorship_tag": "ABX9TyO+U61SsUd5D9aYKQLGAMPF",
88
"include_colab_link": true
99
},
1010
"kernelspec": {
@@ -353,6 +353,135 @@
353353
]
354354
}
355355
]
356+
},
357+
{
358+
"cell_type": "markdown",
359+
"source": [
360+
"# Feature Engineering"
361+
],
362+
"metadata": {
363+
"id": "Cb2BpKZUuy6Q"
364+
}
365+
},
366+
{
367+
"cell_type": "code",
368+
"source": [
369+
"from pyspark.sql.functions import dayofmonth, month, year, lag\n",
370+
"from pyspark.sql.window import Window\n",
371+
"\n",
372+
"#Window\n",
373+
"wind_spec = Window.orderBy('Order_Date')\n",
374+
"\n",
375+
"# Add lag feature: sales of the previous row in date order (dates have gaps, so this is not always the previous calendar day)\n",
376+
"daily_sales = daily_sales.withColumn('Prev_Day_Sales', lag('Daily_Sales').over(wind_spec))\n",
377+
"\n",
378+
"daily_sales = daily_sales.withColumn('Day', dayofmonth(col('Order_Date')))\n",
379+
"daily_sales = daily_sales.withColumn('Month', month(col('Order_Date')))\n",
380+
"daily_sales = daily_sales.withColumn('Year', year(col('Order_Date')))\n",
381+
"\n",
382+
"# Drop rows with any null value, e.g. the first row, whose lag value is null\n",
383+
"daily_sales = daily_sales.na.drop()\n",
384+
"\n",
385+
"daily_sales.show(10)"
386+
],
387+
"metadata": {
388+
"id": "_TvbaBNVu50q",
389+
"outputId": "6440540e-b667-46c3-9179-f185a70040e2",
390+
"colab": {
391+
"base_uri": "https://localhost:8080/"
392+
}
393+
},
394+
"execution_count": 30,
395+
"outputs": [
396+
{
397+
"output_type": "stream",
398+
"name": "stdout",
399+
"text": [
400+
"+----------+-----------------+-----------------+---+-----+----+\n",
401+
"|Order_Date| Daily_Sales| Prev_Day_Sales|Day|Month|Year|\n",
402+
"+----------+-----------------+-----------------+---+-----+----+\n",
403+
"|2021-01-05| 19.536| 288.06| 5| 1|2021|\n",
404+
"|2021-01-06| 4407.1| 19.536| 6| 1|2021|\n",
405+
"|2021-01-07|87.15799999999999| 4407.1| 7| 1|2021|\n",
406+
"|2021-01-09| 40.544|87.15799999999999| 9| 1|2021|\n",
407+
"|2021-01-10| 54.83| 40.544| 10| 1|2021|\n",
408+
"|2021-01-11| 9.94| 54.83| 11| 1|2021|\n",
409+
"|2021-01-13| 3553.795| 9.94| 13| 1|2021|\n",
410+
"|2021-01-14| 61.96| 3553.795| 14| 1|2021|\n",
411+
"|2021-01-15| 149.95| 61.96| 15| 1|2021|\n",
412+
"|2021-01-16| 299.964| 149.95| 16| 1|2021|\n",
413+
"+----------+-----------------+-----------------+---+-----+----+\n",
414+
"only showing top 10 rows\n",
415+
"\n"
416+
]
417+
}
418+
]
419+
},
420+
{
421+
"cell_type": "markdown",
422+
"source": [
423+
"# Training and Testing"
424+
],
425+
"metadata": {
426+
"id": "gvQchto0yyZX"
427+
}
428+
},
429+
{
430+
"cell_type": "code",
431+
"source": [
432+
"from pyspark.ml.feature import VectorAssembler\n",
433+
"\n",
434+
"# Assemble the features into a single vector\n",
435+
"feature_col = ['Day', 'Month', 'Year', 'Prev_Day_Sales']\n",
436+
"assembler = VectorAssembler(inputCols = feature_col, outputCol = 'features')\n",
437+
"\n",
438+
"daily_sales_update = assembler.transform(daily_sales).select('features', 'Daily_Sales')\n",
439+
"\n",
440+
"daily_sales_update.show(10)"
441+
],
442+
"metadata": {
443+
"id": "wSLIMTLjy3pl",
444+
"outputId": "ec1d4068-8faf-4097-b49b-3979149cc836",
445+
"colab": {
446+
"base_uri": "https://localhost:8080/"
447+
}
448+
},
449+
"execution_count": 31,
450+
"outputs": [
451+
{
452+
"output_type": "stream",
453+
"name": "stdout",
454+
"text": [
455+
"+--------------------+-----------------+\n",
456+
"| features| Daily_Sales|\n",
457+
"+--------------------+-----------------+\n",
458+
"|[5.0,1.0,2021.0,2...| 19.536|\n",
459+
"|[6.0,1.0,2021.0,1...| 4407.1|\n",
460+
"|[7.0,1.0,2021.0,4...|87.15799999999999|\n",
461+
"|[9.0,1.0,2021.0,8...| 40.544|\n",
462+
"|[10.0,1.0,2021.0,...| 54.83|\n",
463+
"|[11.0,1.0,2021.0,...| 9.94|\n",
464+
"|[13.0,1.0,2021.0,...| 3553.795|\n",
465+
"|[14.0,1.0,2021.0,...| 61.96|\n",
466+
"|[15.0,1.0,2021.0,...| 149.95|\n",
467+
"|[16.0,1.0,2021.0,...| 299.964|\n",
468+
"+--------------------+-----------------+\n",
469+
"only showing top 10 rows\n",
470+
"\n"
471+
]
472+
}
473+
]
474+
},
475+
{
476+
"cell_type": "code",
477+
"source": [
478+
"train, test = daily_sales_update.randomSplit([0.8, 0.2], seed= 42)"
479+
],
480+
"metadata": {
481+
"id": "cVHQJ5Er06Pl"
482+
},
483+
"execution_count": 32,
484+
"outputs": []
356485
}
357486
]
358487
}

0 commit comments

Comments
 (0)