diff --git a/DAKD2020_ex3_Elias_Ervela.ipynb b/DAKD2020_ex3_Elias_Ervela.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..c7884966ec90219bd4477c56a406634b1d054f53
--- /dev/null
+++ b/DAKD2020_ex3_Elias_Ervela.ipynb
@@ -0,0 +1 @@
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"DAKD2020_ex3_Elias_Ervela.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"toc":{"base_numbering":1,"nav_menu":{"height":"180px","width":"160px"},"number_sections":true,"sideBar":true,"skip_h1_title":true,"title_cell":"Table of Contents","title_sidebar":"Contents","toc_cell":false,"toc_position":{},"toc_section_display":true,"toc_window_display":false}},"cells":[{"cell_type":"markdown","metadata":{"id":"o8Yz6wcFl0SZ"},"source":["Elias Ervelä <br>\n","student number 518434 <br>\n","emerve@utu.fi  <br>\n","December, 4, 2020  <br>"]},{"cell_type":"markdown","metadata":{"id":"rMMbxwZ7l0SZ"},"source":["# Data Analysis and Knowledge Discovery: Exercise 3, Supervised learning"]},{"cell_type":"markdown","metadata":{"id":"ND3O3s6Wl0SZ"},"source":["This is the template for the third exercise. The idea of this exercise is to apply supervised learning to predict the ship type using certain attributes (speed, destination harbour...) and K nearest neighbors (kNN) classifier. The data is available in Moodle course page: shipdata_2020.xlsx. <br> \n","\n","General guidance for exercises is given in Moodle course page. 
<br>\n","\n"," - answer all the questions below\n"," - write easily readable code, include explanations what your code does\n"," - make informative illustrations: include labels for x- and y-axes, legends and captions for your plots\n"," - do not change anything manually or outside the script in the data file\n"," - before saving the ipynb file (and possible printing) run: \"Restart & Run all\", to make sure you return a file that works as expected\n"," - name your file as DAKD2020_ex3_firstname_lastname.ipynb\n"," - +1 bonus point requires a correct solution and also thorough analysis. Discuss also how the results could be improved\n"," - if you encounter problems, Google first. If you can't find an answer to the problem, don't hesitate to ask in the Moodle discussion or directly: pekavir@utu.fi\n"," - Note! Don't leave it to the last moment! No feedback service during the weekend\n"," - The deadline is **Friday 4th of December 23:59**"]},{"cell_type":"markdown","metadata":{"id":"tjQo2BLJl0SZ"},"source":["## Data import"]},{"cell_type":"markdown","metadata":{"id":"Hkfm5UhCl0SZ"},"source":["Gather *all* packages needed for this notebook here:"]},{"cell_type":"code","metadata":{"id":"_HNy2B6Hl0SZ"},"source":["import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"FFHDDGrIl0Sa"},"source":["Import the data."]},{"cell_type":"markdown","metadata":{"id":"E0T5KP-ACagm"},"source":["Lets import the data from my google drive.\n","I used this as a help: https://buomsoo-kim.github.io/colab/2018/04/16/Importing-files-from-Google-Drive-in-Google-Colab.md/\n"]},{"cell_type":"code","metadata":{"id":"iaEpSNSw_IoA"},"source":["from pydrive.auth import GoogleAuth\n","from pydrive.drive import GoogleDrive\n","from google.colab import auth\n","from oauth2client.client import 
GoogleCredentials"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"74B_BhOPAb2a"},"source":["auth.authenticate_user()\n","gauth = GoogleAuth()\n","gauth.credentials = GoogleCredentials.get_application_default()\n","drive = GoogleDrive(gauth)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"50oqP3boAm6b"},"source":["downloaded = drive.CreateFile({'id':\"1Pkdj3ZSe_ipq31Z7EY2Lpgwdb7oebEjl\"})   # replace the id with id of file you want to access\n","downloaded.GetContentFile('shipdata_2020.xlsx')        # replace the file name with your file"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Zo4R468vA0cs"},"source":["data = pd.read_excel('shipdata_2020.xlsx')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":419},"id":"XP3Pv_M_C2rx","executionInfo":{"status":"ok","timestamp":1607101253685,"user_tz":-120,"elapsed":2176,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"b2b11437-4802-4ae8-9707-dc6bc358a7a7"},"source":["data"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>MMSI</th>\n","      <th>Speed</th>\n","      <th>COG</th>\n","      <th>Destination</th>\n","      <th>Ship_type</th>\n","      <th>Gross_tonnage</th>\n","      <th>Length</th>\n","      <th>Breadth</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    
<tr>\n","      <th>0</th>\n","      <td>212209000</td>\n","      <td>10.1377</td>\n","      <td>64.3074</td>\n","      <td>Hamina</td>\n","      <td>Cargo</td>\n","      <td>3416</td>\n","      <td>94.91</td>\n","      <td>15.34</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>212436000</td>\n","      <td>13.5256</td>\n","      <td>77.0755</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>6280</td>\n","      <td>116.90</td>\n","      <td>18.00</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>219082000</td>\n","      <td>9.9416</td>\n","      <td>74.6762</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>9980</td>\n","      <td>141.20</td>\n","      <td>21.90</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>219083000</td>\n","      <td>11.6038</td>\n","      <td>74.7529</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>9980</td>\n","      <td>141.20</td>\n","      <td>21.60</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>219426000</td>\n","      <td>11.9203</td>\n","      <td>56.3253</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>3219</td>\n","      <td>99.90</td>\n","      <td>15.00</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>129</th>\n","      <td>273374820</td>\n","      <td>10.0396</td>\n","      <td>74.6253</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>4979</td>\n","      <td>139.90</td>\n","      <td>16.70</td>\n","    </tr>\n","    <tr>\n","      <th>130</th>\n","      <td>273385070</td>\n","      <td>9.3507</td>\n","      <td>74.5454</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>4979</td>\n","      <td>139.90</td>\n","  
    <td>16.94</td>\n","    </tr>\n","    <tr>\n","      <th>131</th>\n","      <td>273388150</td>\n","      <td>9.7668</td>\n","      <td>68.7159</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>5075</td>\n","      <td>140.85</td>\n","      <td>16.86</td>\n","    </tr>\n","    <tr>\n","      <th>132</th>\n","      <td>636092755</td>\n","      <td>11.1554</td>\n","      <td>73.7013</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>23240</td>\n","      <td>183.00</td>\n","      <td>27.37</td>\n","    </tr>\n","    <tr>\n","      <th>133</th>\n","      <td>357100000</td>\n","      <td>11.2703</td>\n","      <td>59.3888</td>\n","      <td>Vysotsk</td>\n","      <td>Cargo</td>\n","      <td>43717</td>\n","      <td>229.04</td>\n","      <td>32.31</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>134 rows × 8 columns</p>\n","</div>"],"text/plain":["          MMSI    Speed      COG  ... Gross_tonnage  Length  Breadth\n","0    212209000  10.1377  64.3074  ...          3416   94.91    15.34\n","1    212436000  13.5256  77.0755  ...          6280  116.90    18.00\n","2    219082000   9.9416  74.6762  ...          9980  141.20    21.90\n","3    219083000  11.6038  74.7529  ...          9980  141.20    21.60\n","4    219426000  11.9203  56.3253  ...          3219   99.90    15.00\n","..         ...      ...      ...  ...           ...     ...      ...\n","129  273374820  10.0396  74.6253  ...          4979  139.90    16.70\n","130  273385070   9.3507  74.5454  ...          4979  139.90    16.94\n","131  273388150   9.7668  68.7159  ...          5075  140.85    16.86\n","132  636092755  11.1554  73.7013  ...         23240  183.00    27.37\n","133  357100000  11.2703  59.3888  ...         
43717  229.04    32.31\n","\n","[134 rows x 8 columns]"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"4RT-aufBl0Sa"},"source":["## Data preprocessing"]},{"cell_type":"markdown","metadata":{"id":"xYkpfGTul0Sa"},"source":[" - First, find out how many different destinations there are in the data. Do you need to make any preprocessing? **1p**\n"," - Destination harbor is a categorical variable. It needs to be converted into numerical. Explain, why do you need to make this step? You can use get_dummies from pandas to implement onehot coding for categorical features **1p**\n"," - Plot Gross tonnage versus the ship Length. Use different colors for different ship types. According to the plot, there is one clear outlier. Find the correct value from marinetraffic.com, and make the correction **1p**\n"," - It is good to exploit domain knowledge and make some reasonable transformation to the feature values to improve the expected results and/or to avoid redundancy. Find out what gross tonnage means. Make some transformation to Length values to acquire a linear relationship between the transformed length and Gross tonnage values **1p**\n"," - The numerical variables have quite different ranges. To ensure that all variables can have the same importance on the model, perform Z-score standardization. Perform it for speed, transformed length, and breadth **1p**"]},{"cell_type":"markdown","metadata":{"id":"KF1p6H4uHvoS"},"source":["**First, find out how many different destinations there are in the data. Do you need to make any preprocessing? 
1p**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"G1OaInNuDRDs","executionInfo":{"status":"ok","timestamp":1607101253686,"user_tz":-120,"elapsed":2170,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"5c7794c0-d083-417e-fb9e-d918bf7d2d26"},"source":["data['Destination'].unique()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['Hamina', 'Helsinki', 'Kotka', 'Kronshtadt', 'Kunda', 'Muuga',\n","       'Paldiski', 'Porvoo', 'Primorsk', 'Sillamäe', 'Sillamae',\n","       'Tallinn', 'Ust-Luga', 'Valko-Loviisa', 'Viipuri', 'Vuosaari',\n","       'Vysotsk'], dtype=object)"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UmxxxrSrDvxF","executionInfo":{"status":"ok","timestamp":1607101253686,"user_tz":-120,"elapsed":2162,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"b896b3f7-db13-4622-8e31-4b1fc0ffc662"},"source":["# There is Sillamäe and Sillamae that most likely mean the same place. 
Lets change them all to Sillamae\n","\n","data[data['Destination'] == 'Sillamäe']['Destination']"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["76    Sillamäe\n","77    Sillamäe\n","Name: Destination, dtype: object"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"id":"YqK6qBgdFK5e"},"source":["data.loc[76:77,['Destination']] = \"Sillamae\""],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"C-s1rX5uG0p9","executionInfo":{"status":"ok","timestamp":1607101401738,"user_tz":-120,"elapsed":668,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"453740b7-958e-4401-ebee-dfd4ce7bcc19"},"source":["# Lets check that worked\n","data['Destination'].unique()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['Hamina', 'Helsinki', 'Kotka', 'Kronshtadt', 'Kunda', 'Muuga',\n","       'Paldiski', 'Porvoo', 'Primorsk', 'Sillamae', 'Tallinn',\n","       'Ust-Luga', 'Valko-Loviisa', 'Viipuri', 'Vuosaari', 'Vysotsk'],\n","      dtype=object)"]},"metadata":{"tags":[]},"execution_count":37}]},{"cell_type":"markdown","metadata":{"id":"YLr0Tsu5H22J"},"source":["**Destination harbor is a categorical variable. It needs to be converted into numerical. Explain, why do you need to make this step? 
You can use get_dummies from pandas to implement onehot coding for categorical features 1p**\n"]},{"cell_type":"markdown","metadata":{"id":"6cMX_a4Xtbyk"},"source":["Because kNN classifies by computing distances between feature vectors, and a distance cannot be computed on text labels, the destinations must be numerical so that we can do numerical operations on them. One-hot coding is used instead of plain integer codes because the harbours have no natural order."]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":419},"id":"ld0zFjIaH4mC","executionInfo":{"status":"ok","timestamp":1607101254203,"user_tz":-120,"elapsed":2657,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"6a368371-0688-48f4-f273-432ff9a0055d"},"source":["dest = pd.get_dummies(data['Destination'])\n","dest"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Hamina</th>\n","      <th>Helsinki</th>\n","      <th>Kotka</th>\n","      <th>Kronshtadt</th>\n","      <th>Kunda</th>\n","      <th>Muuga</th>\n","      <th>Paldiski</th>\n","      <th>Porvoo</th>\n","      <th>Primorsk</th>\n","      <th>Sillamae</th>\n","      <th>Tallinn</th>\n","      <th>Ust-Luga</th>\n","      <th>Valko-Loviisa</th>\n","      <th>Viipuri</th>\n","      <th>Vuosaari</th>\n","      <th>Vysotsk</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      
<td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      
<td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>129</th>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>130</th>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>131</th>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>132</th>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>133</th>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","  
</tbody>\n","</table>\n","<p>134 rows × 16 columns</p>\n","</div>"],"text/plain":["     Hamina  Helsinki  Kotka  ...  Viipuri  Vuosaari  Vysotsk\n","0         1         0      0  ...        0         0        0\n","1         1         0      0  ...        0         0        0\n","2         1         0      0  ...        0         0        0\n","3         1         0      0  ...        0         0        0\n","4         1         0      0  ...        0         0        0\n","..      ...       ...    ...  ...      ...       ...      ...\n","129       0         0      0  ...        0         0        1\n","130       0         0      0  ...        0         0        1\n","131       0         0      0  ...        0         0        1\n","132       0         0      0  ...        0         0        1\n","133       0         0      0  ...        0         0        1\n","\n","[134 rows x 16 columns]"]},"metadata":{"tags":[]},"execution_count":11}]},{"cell_type":"markdown","metadata":{"id":"Cbx6CGM0Mh1l"},"source":["**Plot Gross tonnage versus the ship Length. Use different colors for different ship types. According to the plot, there is one clear outlier. 
Find the correct value from marinetraffic.com, and make the correction 1p**\n"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ps4UpUjLOw6B","executionInfo":{"status":"ok","timestamp":1607101254205,"user_tz":-120,"elapsed":2645,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"69ca3ebf-e09a-410e-9441-8bd0fdd75de5"},"source":["# Lets find out the different types of ships\n","\n","data['Ship_type'].unique()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['Cargo', 'Tanker', 'Tug'], dtype=object)"]},"metadata":{"tags":[]},"execution_count":12}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":265},"id":"kpIheTPJMjgM","executionInfo":{"status":"ok","timestamp":1607101254205,"user_tz":-120,"elapsed":2637,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"880f1d6c-1709-4c8d-a219-51e09da9b094"},"source":["# Plot\n","\n","plt.plot(data[data['Ship_type']=='Cargo']['Gross_tonnage'], data[data['Ship_type']=='Cargo']['Length'], 'o', color = 'r', label = 'Cargo')\n","plt.plot(data[data['Ship_type']=='Tanker']['Gross_tonnage'], data[data['Ship_type']=='Tanker']['Length'], 'o', color = 'b', label = 'Tanker')\n","plt.plot(data[data['Ship_type']=='Tug']['Gross_tonnage'], data[data['Ship_type']=='Tug']['Length'], 'o', color = 'g', label = 'Tug')\n","plt.legend()\n","plt.show()"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 
Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":80},"id":"Yw2AtWwuNtwI","executionInfo":{"status":"ok","timestamp":1607101254206,"user_tz":-120,"elapsed":2630,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"d328592a-8f32-4ff6-a006-47f5e5f8d9c5"},"source":["# I can see a clear outlier: a tanker that has <100 length and >20000 gross tonnage\n","\n","data.loc[(data['Length']<100) & (data['Ship_type'] == 'Tanker') & (data['Gross_tonnage']>20000),['Gross_tonnage']]"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Gross_tonnage</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>83</th>\n","      <td>30026</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["    Gross_tonnage\n","83          30026"]},"metadata":{"tags":[]},"execution_count":14}]},{"cell_type":"code","metadata":{"id":"h39L-IaISHZM"},"source":["# Let's change 30026 to the real value 326\n","\n","data.loc[(data['Length']<100) & (data['Ship_type'] == 'Tanker') & (data['Gross_tonnage']>20000),['Gross_tonnage']] = 326"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":265},"id":"GdUIfF6ZTNYc","executionInfo":{"status":"ok","timestamp":1607101254833,"user_tz":-120,"elapsed":3244,"user":{"displayName":"Elias 
Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"4d725b64-582c-4b58-ce9d-25ca08863e01"},"source":["# Lets plot to check what it looks like now\n","\n","plt.plot(data[data['Ship_type']=='Cargo']['Gross_tonnage'], data[data['Ship_type']=='Cargo']['Length'], 'o', color = 'r', label = 'Cargo')\n","plt.plot(data[data['Ship_type']=='Tanker']['Gross_tonnage'], data[data['Ship_type']=='Tanker']['Length'], 'o', color = 'b', label = 'Tanker')\n","plt.plot(data[data['Ship_type']=='Tug']['Gross_tonnage'], data[data['Ship_type']=='Tug']['Length'], 'o', color = 'g', label = 'Tug')\n","plt.legend()\n","plt.show()"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"aLYa3EoeWZTL"},"source":["**It is good to exploit domain knowledge and make some reasonable transformation to the feature values to improve the expected results and/or to avoid redundancy. Find out what gross tonnage means. Make some transformation to Length values to acquire a linear relationship between the transformed length and Gross tonnage values 1p**"]},{"cell_type":"markdown","metadata":{"id":"MFgXUPTkYkzr"},"source":["Gross tonnage is the ship's volume times a multiplier based on the volume. To be exact:\n","\n","\n","Gross_tonnage = V * (0.2 + 0.02 * log10(V))\n","\n","\n","Volume is roughly length * breadth * depth (m^3), and breadth and depth grow roughly in proportion to length. So we can get a linear relationship with gross tonnage by using length^3. 
So:\n","\n","length^3*log10(length^3) ~ gross_tonnage"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":276},"id":"dly3Vdt2WgO6","executionInfo":{"status":"ok","timestamp":1607101254834,"user_tz":-120,"elapsed":3237,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"2728e89a-7272-43ec-89f7-fe88b73c242f"},"source":["# Lets check if we get linear looking data\n","\n","plt.plot(data[data['Ship_type']=='Cargo']['Gross_tonnage'], (data[data['Ship_type']=='Cargo']['Length']**3)*(np.log10(data[data['Ship_type']=='Cargo']['Length']**3)), 'o', color = 'r', label = 'Cargo')\n","plt.plot(data[data['Ship_type']=='Tanker']['Gross_tonnage'], (data[data['Ship_type']=='Tanker']['Length']**3)*(np.log10(data[data['Ship_type']=='Tanker']['Length']**3)), 'o', color = 'b', label = 'Tanker')\n","plt.plot(data[data['Ship_type']=='Tug']['Gross_tonnage'], (data[data['Ship_type']=='Tug']['Length']**3)*(np.log10(data[data['Ship_type']=='Tug']['Length']**3)), 'o', color = 'g', label = 'Tug')\n","plt.legend()\n","plt.show()"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":419},"id":"A7gvUIBQp-Ca","executionInfo":{"status":"ok","timestamp":1607101254835,"user_tz":-120,"elapsed":3230,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"235b0016-11d7-4243-c50f-ff050509085c"},"source":["# Lets add a column for this transformation\n","\n","data['Length_transformed'] = 
(data['Length']**3)*(np.log10(data['Length']**3))\n","\n","data"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>MMSI</th>\n","      <th>Speed</th>\n","      <th>COG</th>\n","      <th>Destination</th>\n","      <th>Ship_type</th>\n","      <th>Gross_tonnage</th>\n","      <th>Length</th>\n","      <th>Breadth</th>\n","      <th>Length_transformed</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>212209000</td>\n","      <td>10.1377</td>\n","      <td>64.3074</td>\n","      <td>Hamina</td>\n","      <td>Cargo</td>\n","      <td>3416</td>\n","      <td>94.91</td>\n","      <td>15.34</td>\n","      <td>5.071453e+06</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>212436000</td>\n","      <td>13.5256</td>\n","      <td>77.0755</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>6280</td>\n","      <td>116.90</td>\n","      <td>18.00</td>\n","      <td>9.910062e+06</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>219082000</td>\n","      <td>9.9416</td>\n","      <td>74.6762</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>9980</td>\n","      <td>141.20</td>\n","      <td>21.90</td>\n","      <td>1.815643e+07</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>219083000</td>\n","      <td>11.6038</td>\n","      <td>74.7529</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>9980</td>\n","      <td>141.20</td>\n","      <td>21.60</td>\n","      
<td>1.815643e+07</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>219426000</td>\n","      <td>11.9203</td>\n","      <td>56.3253</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>3219</td>\n","      <td>99.90</td>\n","      <td>15.00</td>\n","      <td>5.980718e+06</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>129</th>\n","      <td>273374820</td>\n","      <td>10.0396</td>\n","      <td>74.6253</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>4979</td>\n","      <td>139.90</td>\n","      <td>16.70</td>\n","      <td>1.762655e+07</td>\n","    </tr>\n","    <tr>\n","      <th>130</th>\n","      <td>273385070</td>\n","      <td>9.3507</td>\n","      <td>74.5454</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>4979</td>\n","      <td>139.90</td>\n","      <td>16.94</td>\n","      <td>1.762655e+07</td>\n","    </tr>\n","    <tr>\n","      <th>131</th>\n","      <td>273388150</td>\n","      <td>9.7668</td>\n","      <td>68.7159</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>5075</td>\n","      <td>140.85</td>\n","      <td>16.86</td>\n","      <td>1.801271e+07</td>\n","    </tr>\n","    <tr>\n","      <th>132</th>\n","      <td>636092755</td>\n","      <td>11.1554</td>\n","      <td>73.7013</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>23240</td>\n","      <td>183.00</td>\n","      <td>27.37</td>\n","      <td>4.159621e+07</td>\n","    </tr>\n","    <tr>\n","      <th>133</th>\n","      <td>357100000</td>\n","      <td>11.2703</td>\n","      <td>59.3888</td>\n","      <td>Vysotsk</td>\n","      <td>Cargo</td>\n","      <td>43717</td>\n","      <td>229.04</td>\n","      
<td>32.31</td>\n","      <td>8.506501e+07</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>134 rows × 9 columns</p>\n","</div>"],"text/plain":["          MMSI    Speed      COG  ...  Length Breadth  Length_transformed\n","0    212209000  10.1377  64.3074  ...   94.91   15.34        5.071453e+06\n","1    212436000  13.5256  77.0755  ...  116.90   18.00        9.910062e+06\n","2    219082000   9.9416  74.6762  ...  141.20   21.90        1.815643e+07\n","3    219083000  11.6038  74.7529  ...  141.20   21.60        1.815643e+07\n","4    219426000  11.9203  56.3253  ...   99.90   15.00        5.980718e+06\n","..         ...      ...      ...  ...     ...     ...                 ...\n","129  273374820  10.0396  74.6253  ...  139.90   16.70        1.762655e+07\n","130  273385070   9.3507  74.5454  ...  139.90   16.94        1.762655e+07\n","131  273388150   9.7668  68.7159  ...  140.85   16.86        1.801271e+07\n","132  636092755  11.1554  73.7013  ...  183.00   27.37        4.159621e+07\n","133  357100000  11.2703  59.3888  ...  229.04   32.31        8.506501e+07\n","\n","[134 rows x 9 columns]"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"O-qAVjP1hOW0"},"source":["**The numerical variables have quite different ranges. To ensure that all variables can have the same importance on the model, perform Z-score standardization. 
Perform it for speed, transformed length, and breadth 1p**"]},{"cell_type":"code","metadata":{"id":"via0OpTKkPMd"},"source":["data_std = data.copy()\n","\n","data_std['Speed'] = (data_std['Speed'] - data_std['Speed'].mean()) / data_std['Speed'].std()\n","data_std['Length_transformed'] = (data_std['Length_transformed'] - data_std['Length_transformed'].mean()) / data_std['Length_transformed'].std()\n","data_std['Breadth'] = (data_std['Breadth'] - data_std['Breadth'].mean()) / data_std['Breadth'].std()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":419},"id":"mz2A72c7p0Ry","executionInfo":{"status":"ok","timestamp":1607101254836,"user_tz":-120,"elapsed":3217,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"e3a58c76-c5af-4a9d-85f8-83bc566f182f"},"source":["data_std"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>MMSI</th>\n","      <th>Speed</th>\n","      <th>COG</th>\n","      <th>Destination</th>\n","      <th>Ship_type</th>\n","      <th>Gross_tonnage</th>\n","      <th>Length</th>\n","      <th>Breadth</th>\n","      <th>Length_transformed</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>212209000</td>\n","      <td>-0.160696</td>\n","      <td>64.3074</td>\n","      <td>Hamina</td>\n","      <td>Cargo</td>\n","      <td>3416</td>\n","      
<td>94.91</td>\n","      <td>-0.487276</td>\n","      <td>-0.557527</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>212436000</td>\n","      <td>1.574301</td>\n","      <td>77.0755</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>6280</td>\n","      <td>116.90</td>\n","      <td>-0.219871</td>\n","      <td>-0.410920</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>219082000</td>\n","      <td>-0.261122</td>\n","      <td>74.6762</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>9980</td>\n","      <td>141.20</td>\n","      <td>0.172188</td>\n","      <td>-0.161060</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>219083000</td>\n","      <td>0.590117</td>\n","      <td>74.7529</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>9980</td>\n","      <td>141.20</td>\n","      <td>0.142030</td>\n","      <td>-0.161060</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>219426000</td>\n","      <td>0.752202</td>\n","      <td>56.3253</td>\n","      <td>Hamina</td>\n","      <td>Tanker</td>\n","      <td>3219</td>\n","      <td>99.90</td>\n","      <td>-0.521456</td>\n","      <td>-0.529977</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>129</th>\n","      <td>273374820</td>\n","      <td>-0.210935</td>\n","      <td>74.6253</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>4979</td>\n","      <td>139.90</td>\n","      <td>-0.350558</td>\n","      <td>-0.177115</td>\n","    </tr>\n","    <tr>\n","      <th>130</th>\n","      <td>273385070</td>\n","      <td>-0.563732</td>\n","      <td>74.5454</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      
<td>4979</td>\n","      <td>139.90</td>\n","      <td>-0.326431</td>\n","      <td>-0.177115</td>\n","    </tr>\n","    <tr>\n","      <th>131</th>\n","      <td>273388150</td>\n","      <td>-0.350640</td>\n","      <td>68.7159</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>5075</td>\n","      <td>140.85</td>\n","      <td>-0.334473</td>\n","      <td>-0.165415</td>\n","    </tr>\n","    <tr>\n","      <th>132</th>\n","      <td>636092755</td>\n","      <td>0.360484</td>\n","      <td>73.7013</td>\n","      <td>Vysotsk</td>\n","      <td>Tanker</td>\n","      <td>23240</td>\n","      <td>183.00</td>\n","      <td>0.722077</td>\n","      <td>0.549150</td>\n","    </tr>\n","    <tr>\n","      <th>133</th>\n","      <td>357100000</td>\n","      <td>0.419326</td>\n","      <td>59.3888</td>\n","      <td>Vysotsk</td>\n","      <td>Cargo</td>\n","      <td>43717</td>\n","      <td>229.04</td>\n","      <td>1.218685</td>\n","      <td>1.866228</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>134 rows × 9 columns</p>\n","</div>"],"text/plain":["          MMSI     Speed      COG  ...  Length   Breadth  Length_transformed\n","0    212209000 -0.160696  64.3074  ...   94.91 -0.487276           -0.557527\n","1    212436000  1.574301  77.0755  ...  116.90 -0.219871           -0.410920\n","2    219082000 -0.261122  74.6762  ...  141.20  0.172188           -0.161060\n","3    219083000  0.590117  74.7529  ...  141.20  0.142030           -0.161060\n","4    219426000  0.752202  56.3253  ...   99.90 -0.521456           -0.529977\n","..         ...       ...      ...  ...     ...       ...                 ...\n","129  273374820 -0.210935  74.6253  ...  139.90 -0.350558           -0.177115\n","130  273385070 -0.563732  74.5454  ...  139.90 -0.326431           -0.177115\n","131  273388150 -0.350640  68.7159  ...  140.85 -0.334473           -0.165415\n","132  636092755  0.360484  73.7013  ...  
183.00  0.722077            0.549150\n","133  357100000  0.419326  59.3888  ...  229.04  1.218685            1.866228\n","\n","[134 rows x 9 columns]"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"IC45j4lHl0Sa"},"source":["## Classification accuracy with random training and test sets"]},{"cell_type":"markdown","metadata":{"id":"xZOcgo3Gl0Sa"},"source":["Predict the **ship type** using **speed, destination, transformed length, and breadth** as features. Find an estimation for the classification accuracy (number of correctly classified ships to the total number of ships) using *random training and test sets*. <br>\n"," - Produce training and test data **1p**\n","     - Gather the normalized features and one-hot-coded destination columns as array __X__ (input variables), and the ship type as array **y** (output variable)     \n","     - Divide the data randomly into training (20%) and test (80%) sets\n","     - Do you need to use stratification? Explain your decision\n"," - Train the model and test its performance **1p**\n","     - Use kNN classifier with k=3\n","     - Print out the confusion matrix. 
How does the model perform with different ship types?\n","     - What is the (total) classification accuracy?\n"," - Repeat the calculation 1000 times with different split of training/test data, and make a histogram of the results for classification accuracy **1p**\n"," - Discuss your results **1p**"]},{"cell_type":"markdown","metadata":{"id":"ki03tbUxtjoD"},"source":["**Gather the normalized features and one-hot-coded destination columns as array __X__ (input variables), and the ship type as array y (output variable)**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":419},"id":"IaK8iG5Ptizr","executionInfo":{"status":"ok","timestamp":1607101254838,"user_tz":-120,"elapsed":3210,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"a639e251-0851-4444-bdb6-b824a838ee7a"},"source":["X = data_std[['Speed','Length_transformed','Breadth']]\n","X = pd.concat([X,dest], axis=1)\n","X"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Speed</th>\n","      <th>Length_transformed</th>\n","      <th>Breadth</th>\n","      <th>Hamina</th>\n","      <th>Helsinki</th>\n","      <th>Kotka</th>\n","      <th>Kronshtadt</th>\n","      <th>Kunda</th>\n","      <th>Muuga</th>\n","      <th>Paldiski</th>\n","      <th>Porvoo</th>\n","      <th>Primorsk</th>\n","      <th>Sillamae</th>\n","      <th>Tallinn</th>\n","      <th>Ust-Luga</th>\n","      
<th>Valko-Loviisa</th>\n","      <th>Viipuri</th>\n","      <th>Vuosaari</th>\n","      <th>Vysotsk</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>-0.160696</td>\n","      <td>-0.557527</td>\n","      <td>-0.487276</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1.574301</td>\n","      <td>-0.410920</td>\n","      <td>-0.219871</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>-0.261122</td>\n","      <td>-0.161060</td>\n","      <td>0.172188</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>0.590117</td>\n","      <td>-0.161060</td>\n","      <td>0.142030</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","   
   <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>0.752202</td>\n","      <td>-0.529977</td>\n","      <td>-0.521456</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>129</th>\n","      <td>-0.210935</td>\n","      <td>-0.177115</td>\n","      <td>-0.350558</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>130</th>\n","      <td>-0.563732</td>\n","      <td>-0.177115</td>\n","      <td>-0.326431</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>131</th>\n","      
<td>-0.350640</td>\n","      <td>-0.165415</td>\n","      <td>-0.334473</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>132</th>\n","      <td>0.360484</td>\n","      <td>0.549150</td>\n","      <td>0.722077</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>133</th>\n","      <td>0.419326</td>\n","      <td>1.866228</td>\n","      <td>1.218685</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>134 rows × 19 columns</p>\n","</div>"],"text/plain":["        Speed  Length_transformed   Breadth  ...  Viipuri  Vuosaari  Vysotsk\n","0   -0.160696           -0.557527 -0.487276  ...        0         0        0\n","1    1.574301           -0.410920 -0.219871  ...        0         0        0\n","2   -0.261122           -0.161060  0.172188  ...        0         0        0\n","3    0.590117           -0.161060  0.142030  ...        0         0        0\n","4    0.752202           -0.529977 -0.521456  ...        0         0        0\n","..        ...                 ...       ...  ...      ... 
      ...      ...\n","129 -0.210935           -0.177115 -0.350558  ...        0         0        1\n","130 -0.563732           -0.177115 -0.326431  ...        0         0        1\n","131 -0.350640           -0.165415 -0.334473  ...        0         0        1\n","132  0.360484            0.549150  0.722077  ...        0         0        1\n","133  0.419326            1.866228  1.218685  ...        0         0        1\n","\n","[134 rows x 19 columns]"]},"metadata":{"tags":[]},"execution_count":21}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":419},"id":"8NZYrLXD1rhm","executionInfo":{"status":"ok","timestamp":1607101255201,"user_tz":-120,"elapsed":3565,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"07d2fe01-498a-4292-8d6e-a90482f3c90c"},"source":["y = data_std[['Ship_type']]\n","y"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Ship_type</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>Cargo</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","    </tr>\n","    
<tr>\n","      <th>129</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>130</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>131</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>132</th>\n","      <td>Tanker</td>\n","    </tr>\n","    <tr>\n","      <th>133</th>\n","      <td>Cargo</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>134 rows × 1 columns</p>\n","</div>"],"text/plain":["    Ship_type\n","0       Cargo\n","1      Tanker\n","2      Tanker\n","3      Tanker\n","4      Tanker\n","..        ...\n","129    Tanker\n","130    Tanker\n","131    Tanker\n","132    Tanker\n","133     Cargo\n","\n","[134 rows x 1 columns]"]},"metadata":{"tags":[]},"execution_count":22}]},{"cell_type":"markdown","metadata":{"id":"nfSFv9Zf9tDT"},"source":["**Divide the data randomly into training (20%) and test (80%) sets**\n","\n","**Do you need to use stratification? Explain your decision**\n"]},{"cell_type":"markdown","metadata":{"id":"KyO5YrbXwaX8"},"source":["\n","Yes. 
A purely random split does not guarantee that every ship type is represented in the training data in the same proportion as in the full data set. Because only 20% of the ships are used for training, a rare ship type could end up with very few or no training examples, so the split is stratified by ship type."]},{"cell_type":"code","metadata":{"id":"7HrASZhP2K0W"},"source":["# Let's divide the data into 20% training and 80% test sets using stratification\n","from sklearn.model_selection import train_test_split\n","\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, stratify = y, random_state = 1) # Fix the random state so reruns give the same result\n","\n","#print(X_train)\n","#print(y_train)\n","#print(X_test)\n","#print(y_test)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"hVlrC2AKCHeG"},"source":["**Use kNN classifier with k=3**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"juKnfvVACGfH","executionInfo":{"status":"ok","timestamp":1607101255202,"user_tz":-120,"elapsed":3551,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"402faa08-986a-4be7-ca17-208b91eda920"},"source":["from sklearn.neighbors import KNeighborsClassifier\n","\n","neigh = KNeighborsClassifier(n_neighbors=3)\n","neigh.fit(X_train, np.ravel(y_train))"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n","                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,\n","                     weights='uniform')"]},"metadata":{"tags":[]},"execution_count":24}]},{"cell_type":"markdown","metadata":{"id":"yeTR4o5_xPIS"},"source":["**Print out the confusion matrix. 
How does the model perform with different ship types?**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"AFxSmkd9Gpau","executionInfo":{"status":"ok","timestamp":1607101255202,"user_tz":-120,"elapsed":3544,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"43a07cb5-a716-4a71-cffc-11e1695a1257"},"source":["from sklearn.metrics import confusion_matrix\n","y_pred = neigh.predict(X_test)\n","\n","confusion_matrix(y_test, y_pred, labels=['Cargo','Tanker','Tug'])"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[34, 19,  1],\n","       [ 6, 41,  0],\n","       [ 6,  0,  1]])"]},"metadata":{"tags":[]},"execution_count":25}]},{"cell_type":"markdown","metadata":{"id":"s4mOPxApH8xY"},"source":["Per-class accuracy (recall: correctly classified ships / actual ships of that type), read from the rows of the confusion matrix:\n","\n","- Cargo: 34/54, about 63% accuracy.\n","- Tanker: 41/47, about 87% accuracy.\n","- Tug: 1/7, about 14% accuracy.\n","\n","The model performed best with Tanker and worst with Tug: 6 of the 7 Tugs were classified as Cargo. The most common error overall was Cargo predicted as Tanker (19 ships)."]},{"cell_type":"markdown","metadata":{"id":"0K6BzRlPxV-N"},"source":["**What is the (total) classification accuracy?**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"thdl7671G0bk","executionInfo":{"status":"ok","timestamp":1607101255202,"user_tz":-120,"elapsed":3537,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"55c577ee-3588-41b3-9335-c9ac73485afb"},"source":["# Total accuracy\n","\n","neigh.score(X_test, np.ravel(y_test))"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.7037037037037037"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"markdown","metadata":{"id":"q_gB_4N1KNU2"},"source":["**Repeat the calculation 1000 times with different split of training/test data, and make a histogram of the results for 
classification accuracy 1p**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":282},"id":"oFfwi1t8KRp7","executionInfo":{"status":"ok","timestamp":1607101263904,"user_tz":-120,"elapsed":12231,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"e8945eb7-1c03-417f-b9d5-4c9a09430198"},"source":["# Test size 0.8\n","accuracy = np.zeros(1000)\n","for i in range(1000):\n","  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, stratify = y)\n","\n","  neigh = KNeighborsClassifier(n_neighbors=3)\n","  neigh.fit(X_train, np.ravel(y_train))\n","\n","  accuracy[i] = neigh.score(X_test, np.ravel(y_test))\n","\n","plt.hist(accuracy)\n","plt.show()\n","\n","print(np.mean(accuracy))"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAOp0lEQVR4nO3df6zddX3H8edrVDGZbpb12rDSeZmpcSXZ0N0wMvcHC4mUkqyQJaQs0+rYahZcNPGfqn9IlpB1yYTNzJEUJdZFZcQfoQvsB2MsRjPUiyJQGFKhhHaVXn/MHzFho773x/02Hspt77nnB+ccPs9HcnK+5/P9fs73/bmf5r7u9/s939NUFZKk9vzcpAuQJE2GASBJjTIAJKlRBoAkNcoAkKRGrZt0AQAbNmyo+fn5SZchSTPl/vvv/05VzQ3afyoCYH5+nsXFxUmXIUkzJclTw/T3FJAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDVqKu4ElvRC83vunMh+D++9YiL71YvPIwBJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWrUqgGQZHOSe5M8kuRgknd37dcnOZrkge6xvafP+5IcSvJYksvGOQBJ0mD6+SqI54D3VtXXkrwKuD/J3d26m6rqr3o3TrIV2AlcAPwy8G9JXl9VJ0ZZuCRpOKseAVTVsar6Wrf8I+BRYNMZuuwAbquqZ6vqSeAQcNEoipUkjc6argEkmQfeCHy5a3pXkgeT3Jpkfde2CXi6p9sRzhwYkqQJ6DsAkrwS+Czwnqr6IXAz8DrgQuAY8KG17DjJ7iSLSRaXlpbW0lWSNAJ9BUCSl7H8y/+TVfU5gKp6pqpOVNVPgVv42Wmeo8Dmnu7ndW3PU1X7qmqhqhbm5uaGGYMkaQD9fAoowMeAR6vqxp72c3s2uwp4uFs+AOxMcnaS84EtwFdGV7IkaRT6+RTQm4G3Ag8leaBrez9wTZILgQIOA+8EqKq
DSW4HHmH5E0TX+QkgSZo+qwZAVX0RyAqr7jpDnxuAG4aoS5I0Zt4JLEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjernqyAkNWR+z50T2/fhvVdMbN8t8ghAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNWjUAkmxOcm+SR5IcTPLurv2cJHcnebx7Xt+1J8mHkxxK8mCSN417EJKktevnCOA54L1VtRW4GLguyVZgD3BPVW0B7uleA1wObOkeu4GbR161JGloqwZAVR2rqq91yz8CHgU2ATuA/d1m+4Eru+UdwCdq2X3Aq5OcO/LKJUlDWdM1gCTzwBuBLwMbq+pYt+rbwMZueRPwdE+3I13bqe+1O8liksWlpaU1li1JGlbfAZDklcBngfdU1Q9711VVAbWWHVfVvqpaqKqFubm5tXSVJI1AXwGQ5GUs//L/ZFV9rmt+5uSpne75eNd+FNjc0/28rk2SNEX6+RRQgI8Bj1bVjT2rDgC7uuVdwB097W/rPg10MfCDnlNFkqQpsa6Pbd4MvBV4KMkDXdv7gb3A7UmuBZ4Cru7W3QVsBw4BPwHeMdKKJUkjsWoAVNUXgZxm9aUrbF/AdUPWJUkaM+8ElqRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktSofr4LSGra/J47J12CNBYeAUhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNWjUAktya5HiSh3vark9yNMkD3WN7z7r3JTmU5LEkl42rcEnScPo5Avg4sG2F9puq6sLucRdAkq3ATuCCrs/fJTlrVMVKkkZn1QCoqi8A3+vz/XYAt1XVs1X1JHAIuGiI+iRJYzLMNYB3JXmwO0W0vmvbBDzds82Rru0FkuxOsphkcWlpaYgyJEmDGDQAbgZeB1wIHAM+tNY3qKp9VbVQVQtzc3MDliFJGtRAAVBVz1TViar6KXALPzvNcxTY3LPpeV2bJGnKDBQASc7teXkVcPITQgeAnUnOTnI+sAX4ynAlSpLGYd1qGyT5NHAJsCHJEeCDwCVJLgQKOAy8E6CqDia5HXgEeA64rqpOjKd0SdIwVg2AqrpmheaPnWH7G4AbhilKkjR+3gksSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVGrfhWEJL1Y5vfcOZH9Ht57xUT2O2keAUhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJatSqAZDk1iTHkzzc03ZOkruTPN49r+/ak+TDSQ4leTDJm8ZZvCRpcP0cAXwc2HZK2x7gnqraAtzTvQa4HNjSPXYDN4+mTEnSqK0aAFX1BeB7pzTvAPZ3y/uBK3vaP1HL7gNeneTcURUrSRqdQa8BbKyqY93yt4GN3fIm4Ome7Y50bZKkKTP0ReCqKqDW2i/J7iSLSRaXlpaGLUOStEaDBsAzJ0/tdM/Hu/ajwOae7c7r2l6gqvZV1UJVLczNzQ1YhiRpUIMGwAFgV7e8C7ijp/1t3aeBLgZ+0HOqSJI0RdattkGSTwOXABuSHAE+COwFbk9yLfAUcHW3+V3AduAQ8BPgHWOoWZI0AqsGQFVdc5pVl66wbQHXDVuUJGn8vBNYkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGrXofgDQN5vfcOekSpJccjwAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASB
JjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKj1g3TOclh4EfACeC5qlpIcg7wD8A8cBi4uqq+P1yZkqRRG8URwO9W1YVVtdC93gPcU1VbgHu615KkKTOOU0A7gP3d8n7gyjHsQ5I0pGEDoIB/TXJ/kt1d28aqOtYtfxvYuFLHJLuTLCZZXFpaGrIMSdJaDXUNAPidqjqa5DXA3Un+q3dlVVWSWqljVe0D9gEsLCysuI0kaXyGOgKoqqPd83Hg88BFwDNJzgXono8PW6QkafQGDoAkP5/kVSeXgbcADwMHgF3dZruAO4YtUpI0esOcAtoIfD7Jyff5VFX9c5KvArcnuRZ4Crh6+DIlSaM2cABU1RPAb6zQ/l3g0mGKkiSN37AXgdWY+T13TroESSPiV0FIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQo7wOQ1LxJ3t9yeO8VE9u3RwCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRnkj2AzyP2WRNAoeAUhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqO8E3gI3pEraZZ5BCBJjTIAJKlRYzsFlGQb8DfAWcBHq2rvOPbjaRhJGsxYjgCSnAV8BLgc2Apck2TrOPYlSRrMuE4BXQQcqqonqup/gduAHWPalyRpAOM6BbQJeLrn9RHgt3o3SLIb2N29/HGSxwbc1wbgOwP2nVaOaTY4ptkxtePKXw7cdQPw2mH2PbGPgVbVPmDfsO+TZLGqFkZQ0tRwTLPBMc2Ol+K4ujHND/Me4zoFdBTY3PP6vK5NkjQlxhUAXwW2JDk/ycuBncCBMe1LkjSAsZwCqqrnkrwL+BeWPwZ6a1UdHMe+GMFppCnkmGaDY5odL8VxDX8KvapGUYgkacZ4J7AkNcoAkKRGTXUAJNmW5LEkh5LsWWH925MsJXmge/xxz7pdSR7vHrte3MpPb8gxnehpn5qL6quNqdvm6iSPJDmY5FM97TM5T902pxvTTM5Tkpt66v5mkv/pWTeT87TKmGZ1nn4lyb1Jvp7kwSTbe9a9r+v3WJLLVt1ZVU3lg+WLx98CfhV4OfANYOsp27wd+NsV+p4DPNE9r++W18/ymLp1P570GAYc0xbg6yfnAHjNS2CeVhzTLM/TKdv/Gcsf3pjpeTrdmGZ5nli++Pun3fJW4HDP8jeAs4Hzu/c560z7m+YjgGG+TuIy4O6q+l5VfR+4G9g2pjrX4qX4FRn9jOlPgI90c0FVHe/aZ3meTjemabXWf3vXAJ/ulmd5nnr1jmla9TOmAn6hW/5F4L+75R3AbVX1bFU9CRzq3u+0pjkAVvo6iU0rbPf73WHQZ5KcvPms374vtmHGBPCKJItJ7kty5Vgr7V8/Y3o98PokX+pq37aGvpMwzJhgducJgCSvZfkvyH9fa98X2TBjgtmdp+uBP0xyBLiL5SObfvs+zzQHQD/+EZivql9n+a+S/ROuZxTONKbX1vLt7H8A/HWS102iwAGsY/mUySUs/xV2S5JXT7Si4Z1pTLM6TyftBD5TVScmXcgIrTSmWZ2na4CPV9V5wHbg75MM9Lt8mgNg1a+TqKrvVtWz3cuPAr/Zb98JGWZMVNXR7vkJ4D+AN46z2D7187M+Ahyoqv/rDk2/yfIvz5mdJ04/plmep5N28vxTJbM8TyedOqZZnqdrgdsBquo/gVew/MVwa5+nSV/0OMPFkHUsX2w6n59dDLnglG3O7Vm+CrivWz4HeJLlC1bru+VzZnxM64Gzu+UNwOOc4YLXlI1pG7C/p/angV+a8Xk63Zhmdp667d4AHKa7SbRrm9l5OsOYZnaegH8C3t4t/xrL1wACXMDzLwI/wSoXgSc62D5+GNtZ/svqW8AHurY/B36vW/4L4GA36HuBN/T0/SOWL4IcAt4x6bEMOyb
gt4GHuvaHgGsnPZY1jCnAjcAjXe07XwLztOKYZnmeutfXA3tX6DuT83S6Mc3yPLH8aZ8vdbU/ALylp+8Hun6PAZevti+/CkKSGjXN1wAkSWNkAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRG/T8tcGIs+lLfsgAAAABJRU5ErkJggg==\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}},{"output_type":"stream","text":["0.6838055555555557\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":282},"id":"chKlN4gWa6D6","executionInfo":{"status":"ok","timestamp":1607101271503,"user_tz":-120,"elapsed":19821,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"bdd40a37-46ad-495b-aa34-fbc9672b01f7"},"source":["# Test size 0.5\n","accuracy = np.zeros(1000)\n","for i in range(1000):\n","  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify = y)\n","\n","  neigh = KNeighborsClassifier(n_neighbors=3)\n","  neigh.fit(X_train, np.ravel(y_train))\n","\n","  accuracy[i] = neigh.score(X_test, 
np.ravel(y_test))\n","\n","plt.hist(accuracy)\n","plt.show()\n","\n","print(np.mean(accuracy))"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAOlUlEQVR4nO3dbYxc1X3H8e+vkBCpSRtTOxY1TpZERi20KqQrQptWokINBNSaqBIyUhOXojqqoEqkvKhJXgRVQnKlAlLUlMoRKKZKoCgPxRL0gRLaKFHzsBAC2JRgwAi7Dt6EJBClosX598VcN4NZe2d2ZnZ2T74faTRnzr137vnvtX6+e+7M3VQVkqS2/My0ByBJGj/DXZIaZLhLUoMMd0lqkOEuSQ06edoDAFi7dm3NzMxMexiStKo88MAD36mqdQstWxHhPjMzw9zc3LSHIUmrSpJnjrfMaRlJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWrQiviGqrSYme13T23f+3dcOrV9S0vlmbskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSgRcM9ycYk9yfZm2RPkg90/dclOZjkoe5xSd821ybZl+TxJBdNsgBJ0qsNcuOwl4EPVdWDSd4APJDk3m7ZTVX1V/0rJzkL2AKcDfwi8K9JzqyqI+McuCTp+BY9c6+qQ1X1YNd+EXgM2HCCTTYDd1TVS1X1NLAPOG8cg5UkDWaoOfckM8C5wFe7rmuSPJzk1iRrur4NwLN9mx1ggf8MkmxLMpdkbn5+fuiBS5KOb+BwT/J64LPAB6vqBeBm4G3AOcAh4IZhdlxVO6tqtqpm161bN8ymkqRFDBTuSV5DL9g/VVWfA6iq56rqSFX9GPgEP5l6OQhs7Nv89K5PkrRMBvm0TIBbgMeq6sa+/tP6VnsP8GjX3g1sSXJKkjOATcDXxjdkSdJiBvm0zDuB9wKPJHmo6/swcEWSc4AC9gPvB6iqPUnuBPbS+6TN1X5SRpKW16LhXlVfArLAontOsM31wPUjjEuSNAK/oSpJDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDBvljHdJPtZntd09lv/t3XDqV/aoNnrlLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1aNFwT7Ixyf1J9ibZk+QDXf+pSe5N8kT3vKbrT5KPJdmX5OEkb590EZKkVxrkzP1l4ENVdRZwPnB1krOA7cB9VbUJuK97DfBuYFP32AbcPPZRS5JOaNFwr6pDVfVg134ReAzYAGwGdnWr7QIu69qbgduq5yvAG5OcNvaRS5KOa6g59yQzwLnAV4H1VXWoW/RtYH3X3gA827fZga7v2PfalmQuydz8/PyQw5YkncjAf4kpyeuBzwIfrKoXkvz/sqqqJDXMjqtqJ7ATYHZ2dqhtNT3T+qtEkoYz0Jl7ktfQC/ZPVdXnuu7njk63dM+Hu/6DwMa+zU/v+iRJy2SQT8sEuAV4rKpu7Fu0G9jatbcCd/X1v6/71Mz5wA/6pm8kSctgkGmZdwLvBR5J8lDX92FgB3BnkquAZ4DLu2X3AJcA+4AfAVeOdcSSpEUtGu5V9SUgx1l84QLrF3D1iOOSJI3Ab6hKUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw1
2SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBi0a7kluTXI4yaN9fdclOZjkoe5xSd+ya5PsS/J4kosmNXBJ0vENcub+SeDiBfpvqqpzusc9AEnOArYAZ3fb/E2Sk8Y1WEnSYBYN96r6IvD8gO+3Gbijql6qqqeBfcB5I4xPkrQEo8y5X5Pk4W7aZk3XtwF4tm+dA13fqyTZlmQuydz8/PwIw5AkHWup4X4z8DbgHOAQcMOwb1BVO6tqtqpm161bt8RhSJIWsqRwr6rnqupIVf0Y+AQ/mXo5CGzsW/X0rk+StIyWFO5JTut7+R7g6CdpdgNbkpyS5AxgE/C10YYoSRrWyYutkOR24AJgbZIDwEeBC5KcAxSwH3g/QFXtSXInsBd4Gbi6qo5MZuiSpONZNNyr6ooFum85wfrXA9ePMihJ0mj8hqokNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYt+mf2JE3HzPa7p7Lf/Tsuncp+NV6euUtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lq0KLhnuTWJIeTPNrXd2qSe5M80T2v6fqT5GNJ9iV5OMnbJzl4SdLCBrm3zCeBvwZu6+vbDtxXVTuSbO9e/znwbmBT93gHcHP3rDGa1j1HJK0ei565V9UXgeeP6d4M7Orau4DL+vpvq56vAG9Mctq4BitJGsxS59zXV9Whrv1tYH3X3gA827fega7vVZJsSzKXZG5+fn6Jw5AkLWTkC6pVVUAtYbudVTVbVbPr1q0bdRiSpD5LDffnjk63dM+Hu/6DwMa+9U7v+iRJy2ip4b4b2Nq1twJ39fW/r/vUzPnAD/qmbyRJy2TRT8skuR24AFib5ADwUWAHcGeSq4BngMu71e8BLgH2AT8CrpzAmCVJi1g03KvqiuMsunCBdQu4etRBSZJG4zdUJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIadPIoGyfZD7wIHAFerqrZJKcCfw/MAPuBy6vqe6MNU5I0jHGcuf9OVZ1TVbPd6+3AfVW1Cbivey1JWkaTmJbZDOzq2ruAyyawD0nSCYwa7gX8S5IHkmzr+tZX1aGu/W1g/UIbJtmWZC7J3Pz8/IjDkCT1G2nOHfitqjqY5E3AvUn+s39hVVWSWmjDqtoJ7ASYnZ1dcB1J0tKMdOZeVQe758PA54HzgOeSnAbQPR8edZCSpOEsOdyT/GySNxxtA+8CHgV2A1u71bYCd406SEnScEaZllkPfD7J0ff5dFX9U5KvA3cmuQp4Brh89GFKkoax5HCvqqeAX1ug/7vAhaMMSpI0mlEvqP5Um9l+97SHIEkL8vYDktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDfKukJJeYZp3O92/49Kp7bs1nrlLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGrTqv8Q0zS9cSNJK5Zm7JDXIcJekBhnuktQgw12SGmS4S1KDVv2nZSS1Y1qffmvxVsMTO3NPcnGSx5PsS7J9UvuRJL3aRM7ck5wEfBz4XeAA8PUku6tq7yT2J0mjaPEPlEzqzP08YF9VPVVV/wPcAWye0L4kSceY1Jz7BuDZvtcHgHf0r5BkG7
Cte/nDJI+Pad9rge+M6b1WihZrAutaTVqsCVZAXfnLkTZ/y/EWTO2CalXtBHaO+32TzFXV7Ljfd5parAmsazVpsSZoty6Y3LTMQWBj3+vTuz5J0jKYVLh/HdiU5IwkrwW2ALsntC9J0jEmMi1TVS8nuQb4Z+Ak4Naq2jOJfS1g7FM9K0CLNYF1rSYt1gTt1kWqatpjkCSNmbcfkKQGGe6S1KBVE+6D3M4gyeVJ9ibZk+TTff1bkzzRPbYu36gXN2JdR5I81D1W1AXrxepKclPf2L+V5Pt9y1bk8RqxptV8rN6c5P4k30jycJJL+pZd2233eJKLlnfkJ7bUupLMJPnvvuP1t8s/+jGoqhX/oHdR9kngrcBrgW8CZx2zzibgG8Ca7vWbuudTgae65zVde820axq1rq79w2nXsNS6jln/z+hddF+xx2uUmlb7saJ30fFPu/ZZwP6+9jeBU4Azuvc5ado1jaGuGeDRadcw6mO1nLkPcjuDPwE+XlXfA6iqw13/RcC9VfV8t+xe4OJlGvdiRqlrJRv29hNXALd37ZV6vEapaSUbpK4Cfq5r/zzwX117M3BHVb1UVU8D+7r3WwlGqasJqyXcF7qdwYZj1jkTODPJl5N8JcnFQ2w7LaPUBfC6JHNd/2WTHuwQBv6ZJ3kLvbO+Lwy77TIbpSZY3cfqOuAPkxwA7qH3W8mg207LKHUBnNFN1/x7kt+e6EgnpKX7uZ9MbwrjAnrfiP1ikl+d6ojGY8G6qur7wFuq6mCStwJfSPJIVT05xbEuxRbgM1V1ZNoDGaOFalrNx+oK4JNVdUOS3wD+LsmvTHtQY3C8ug4Bb66q7yb5deAfkpxdVS9MdbRDWi1n7oPczuAAsLuq/rf7FfFb9EJxJd8KYZS6qKqD3fNTwL8B5056wAMa5me+hVdOX6zU4zVKTav9WF0F3AlQVf8BvI7eDbdW6rGCEerqppm+2/U/QG/u/syJj3jcpj3pP8iD3tnrU/R+1T16ceTsY9a5GNjVtdfS+5XsF+hdmHua3sW5NV371GnXNIa61gCn9PU/wQku8K20urr1fgnYT/dluq5vRR6vEWta1ccK+Efgj7r2L9Obmw5wNq+8oPoUK+eC6ih1rTtaB70LsgdXwr/BoX8G0x7AEAfrEnpnrU8CH+n6/gL4/a4d4EZgL/AIsKVv2z+md7FnH3DltGsZR13Ab3avv9k9XzXtWoapq3t9HbBjgW1X5PFaak2r/VjR+yTJl7vxPwS8q2/bj3TbPQ68e9q1jKMu4A+APV3fg8DvTbuWpTy8/YAkNWi1zLlLkoZguEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QG/R+kqFcRQSMOnAAAAABJRU5ErkJggg==\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}},{"output_type":"stream","text":["0.7231641791044776\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":282},"id":"dlOsrMEVbA6_","executionInfo":{"status":"ok","timestamp":1607101277715,"user_tz":-120,"elapsed":26025,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"8862b047-6b56-452a-fece-4e0f918bcc08"},"source":["# 
Test size 0.2\n","accuracy = np.zeros(1000)\n","for i in range(1000):\n","  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)\n","\n","  neigh = KNeighborsClassifier(n_neighbors=3)\n","  neigh.fit(X_train, np.ravel(y_train))\n","\n","  accuracy[i] = neigh.score(X_test, np.ravel(y_test))\n","\n","plt.hist(accuracy)\n","plt.show()\n","\n","print(np.mean(accuracy))"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAQZElEQVR4nO3dcayddX3H8fdngGimG2CvTdcWy1yNw2UWd4csLpFBnAiJRbeRkkyrYdYtODVxZsX9oS4jwWRKYuZI6mBUo2CHOjphcwxZjIuAFymVFtEKZbRWekVAiRkb9bs/7kM8bW97zr3n3nPKz/crOTnP83t+z3m+99dzP/fp7z7nuakqJElt+YVxFyBJWniGuyQ1yHCXpAYZ7pLUIMNdkhp0/LgLAFiyZEmtWrVq3GVI0rPKXXfd9YOqmpht2zER7qtWrWJqamrcZUjSs0qSh460zWkZSWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoP6hnuS5ya5M8k9SXYk+VDXfm2SB5Ns6x5ruvYk+ViSXUm2J3nlYn8RkqSDDfIJ1aeAc6rqySQnAF9N8q/dtvdV1Q2H9H89sLp7vAq4qnuWnpVWbbxpLMfdfcUFYzmu2tD3zL1mPNmtntA9jvbnm9YCn+z2ux04Kcmy4UuVJA1qoDn3JMcl2QbsB26pqju6TZd3Uy9XJjmxa1sOPNyz+56uTZI0IgOFe1UdqKo1wArgzCS/AVwGvAz4beAU4C/ncuAkG5JMJZmanp6eY9mSpKOZ09UyVfU4cBtwXlXt66ZengL+ETiz67YXWNmz24qu7dDX2lRVk1U1OTEx6x0rJUnzNMjVMhNJTuqWnwe8FvjWM/PoSQJcCNzb7bIVeEt31cxZwBNVtW9RqpckzWqQq2WWAZuTHMfMD4MtVfXFJF9OMgEE2Ab8adf/ZuB8YBfwE+BtC1+2JOlo+oZ7VW0Hzpil/Zwj9C/g0uFLkyTNl59QlaQGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWpQ33BP8twkdya5J8mOJB/q2k9LckeSXUk+m+Q5XfuJ3fqubvuqxf0SJEmHGuTM/SngnKp6BbAGOC/JWcCHgSur6teAx4BLuv6XAI917Vd2/SRJI9Q33GvGk93qCd2jgHOAG7r2zcCF3fLabp1u+7lJsmAVS5L6GmjOPclxSbYB+4FbgO8Cj1fV012XPcDybnk58DBAt/0J4IWzvOaGJFNJpqanp4f7KiRJBxko3KvqQFWtAVYAZwIvG/bAVbWpqiaranJiYmLYl5Mk9ZjT1TJV9ThwG/A7wElJju82rQD2dst7gZUA3fZfBh5dkGolSQMZ5GqZiSQndcvPA14L3MdMyP9h1209cGO3vLVbp9v+5aqqhSxaknR
0x/fvwjJgc5LjmPlhsKWqvphkJ3B9kr8B7gau7vpfDXwqyS7gh8C6RahbknQUfcO9qrYDZ8zS/gAz8++Htv8P8EcLUp0kaV78hKokNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhrUN9yTrExyW5KdSXYkeXfX/sEke5Ns6x7n9+xzWZJdSe5P8rrF/AIkSYc7foA+TwPvrapvJHkBcFeSW7ptV1bV3/Z2TnI6sA54OfArwH8keWlVHVjIwiVJR9b3zL2q9lXVN7rlHwP3AcuPssta4PqqeqqqHgR2AWcuRLGSpMHMac49ySrgDOCOrumdSbYnuSbJyV3bcuDhnt32MMsPgyQbkkwlmZqenp5z4ZKkIxs43JM8H/gc8J6q+hFwFfASYA2wD/jIXA5cVZuqarKqJicmJuayqySpj4HCPckJzAT7p6vq8wBV9UhVHaiqnwKf4GdTL3uBlT27r+jaJEkjMsjVMgGuBu6rqo/2tC/r6fZG4N5ueSuwLsmJSU4DVgN3LlzJkqR+Brla5tXAm4FvJtnWtb0fuDjJGqCA3cA7AKpqR5ItwE5mrrS51CtlJGm0+oZ7VX0VyCybbj7KPpcDlw9RlyRpCH5CVZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkho0yJ/ZkzQGqzbeNJbj7r7igrEcVwvLM3dJapDhLkkN6hvuSVYmuS3JziQ7kry7az8lyS1JvtM9n9y1J8nHkuxKsj3JKxf7i5AkHWyQM/engfdW1enAWcClSU4HNgK3VtVq4NZuHeD1wOrusQG4asGrliQdVd9wr6p9VfWNbvnHwH3AcmAtsLnrthm4sFteC3yyZtwOnJRk2YJXLkk6ojnNuSdZBZwB3AEsrap93abvA0u75eXAwz277enaDn2tDUmmkkxNT0/PsWxJ0tEMHO5Jng98DnhPVf2od1tVFVBzOXBVbaqqyaqanJiYmMuukqQ+Bgr3JCcwE+yfrqrPd82PPDPd0j3v79r3Ait7dl/RtUmSRmSQq2UCXA3cV1Uf7dm0FVjfLa8Hbuxpf0t31cxZwBM90zeSpBEY5BOqrwbeDHwzybau7f3AFcCWJJcADwEXddtuBs4HdgE/Ad62oBVLkvrqG+5V9VUgR9h87iz9C7h0yLokSUPwE6qS1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWpQ33BPck2S/Unu7Wn7YJK9SbZ1j/N7tl2WZFeS+5O8brEKlyQd2SBn7tcC583SfmVVrekeNwMkOR1YB7y82+fvkxy3UMVKkgbTN9yr6ivADwd8vbXA9VX1VFU9COwCzhyiPknSPAwz5/7OJNu7aZuTu7blwMM9ffZ0bYdJsiHJVJKp6enpIcqQJB1qvuF+FfASYA2wD/jIXF+gqjZV1WRVTU5MTMyzDEnSbOYV7lX1SFUdqKqfAp/gZ1Mve4GVPV1XdG2SpBGaV7gnWdaz+kbgmStptgLrkpyY5DRgNXDncCVKkubq+H4dklwHnA0sSbIH+ABwdpI1QAG7gXcAVNWOJFuAncDTwKVVdWBxSpckHUnfcK+qi2dpvvoo/S8HLh+mKEnScPyEqiQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBfT/EJB0LVm28adwlSM8qnrlLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBnkppKSDjPOy091XXDC2Y7fGM3dJapDhLkkNMtwlqUGGuyQ1qG+4J7kmyf4k9/a0nZLkliTf6Z5P7tq
T5GNJdiXZnuSVi1m8JGl2g5y5Xwucd0jbRuDWqloN3NqtA7weWN09NgBXLUyZkqS56BvuVfUV4IeHNK8FNnfLm4ELe9o/WTNuB05KsmyhipUkDWa+c+5Lq2pft/x9YGm3vBx4uKffnq7tMEk2JJlKMjU9PT3PMiRJsxn6F6pVVUDNY79NVTVZVZMTExPDliFJ6jHfcH/kmemW7nl/174XWNnTb0XXJkkaofmG+1Zgfbe8Hrixp/0t3VUzZwFP9EzfSJJGpO+9ZZJcB5wNLEmyB/gAcAWwJcklwEPARV33m4HzgV3AT4C3LULNkqQ++oZ7VV18hE3nztK3gEuHLUqSNBw/oSpJDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KD+t7PXZJGZdXGm8Zy3N1XXDCW4y4mz9wlqUGeuWtOxnVmJWluPHOXpAYZ7pLUoKGmZZLsBn4MHACerqrJJKcAnwVWAbuBi6rqseHKlCTNxUKcuf9eVa2pqslufSNwa1WtBm7t1iVJI7QY0zJrgc3d8mbgwkU4hiTpKIYN9wL+PcldSTZ0bUural+3/H1g6ZDHkCTN0bCXQv5uVe1N8iLgliTf6t1YVZWkZtux+2GwAeDUU08dsgxJUq+hztyram/3vB/4AnAm8EiSZQDd8/4j7LupqiaranJiYmKYMiRJh5h3uCf5xSQveGYZ+H3gXmArsL7rth64cdgiJUlzM8y0zFLgC0meeZ3PVNW/Jfk6sCXJJcBDwEXDlylJmot5h3tVPQC8Ypb2R4FzhylKkjQcP6EqSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGuRfYnoW8q8hSerHM3dJapDhLkkNMtwlqUHOuUv6uTfO32PtvuKCRXldz9wlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDfJDTEPwBl6SjlWeuUtSgxYt3JOcl+T+JLuSbFys40iSDrco0zJJjgM+DrwW2AN8PcnWqtq50MdyakSSDrdYZ+5nAruq6oGq+l/gemDtIh1LknSIxfqF6nLg4Z71PcCrejsk2QBs6FafTHL/ItUymyXAD0Z4vGcDx+RgjsfhHJODLch45MND7f7iI20Y29UyVbUJ2DSOYyeZqqrJcRz7WOWYHMzxOJxjcrBjfTwWa1pmL7CyZ31F1yZJGoHFCvevA6uTnJbkOcA6YOsiHUuSdIhFmZapqqeTvBP4EnAccE1V7ViMY83TWKaDjnGOycEcj8M5Jgc7pscjVTXuGiRJC8xPqEpSgwx3SWpQ0+He7xYISd6aZDrJtu7xJ+Ooc1QGuSVEkouS7EyyI8lnRl3jqA3wHrmy5/3x7SSPj6POURlgPE5NcluSu5NsT3L+OOocpQHG5MVJbu3G4z+TrBhHnYepqiYfzPwi97vArwLPAe4BTj+kz1uBvxt3rcfQeKwG7gZO7tZfNO66xz0mh/T/c2YuDhh77WN8j2wC/qxbPh3YPe66j4Ex+Sdgfbd8DvCpcdddVU2fuXsLhIMNMh5vBz5eVY8BVNX+Edc4anN9j1wMXDeSysZjkPEo4Je65V8GvjfC+sZhkDE5Hfhyt3zbLNvHouVwn+0WCMtn6fcH3X+nbkiycpbtrRhkPF4KvDTJfyW5Pcl5I6tuPAZ9j5DkxcBp/OybuEWDjMcHgT9Osge4mZn/zbRskDG5B3hTt/xG4AVJXjiC2o6q5XAfxL8Aq6rqN4FbgM1jrmfcjmdmauZsZs5SP5HkpLFWdOxYB9xQVQfGXciYXQxcW1UrgPOBTyX5ec+RvwBek+Ru4DXMfBp/7O+Tlv9R+t4CoaoeraqnutV/AH5rRLWNwyC3hNgDbK2q/6uqB4FvMxP2rZrLbTLW0faUDAw2HpcAWwCq6mvAc5m5gVarBsmR71XVm6rqDOCvurax/+K95XDvewuEJMt6Vt8A3DfC+kZtkFt
C/DMzZ+0kWcLMNM0DoyxyxAa6TUaSlwEnA18bcX2jNsh4/DdwLkCSX2cm3KdHWuVoDZIjS3r+93IZcM2Ia5xVs+FeVU8Dz9wC4T5gS1XtSPLXSd7QdXtXd8nfPcC7mLl6pkkDjseXgEeT7GTmF0Pvq6pHx1Px4htwTGDmG/r66i6HaNWA4/Fe4O3d98x1wFtbHpcBx+Rs4P4k3waWApePpdhDePsBSWpQs2fukvTzzHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDfp/HpRrKaUmfoQAAAAASUVORK5CYII=\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}},{"output_type":"stream","text":["0.7384074074074075\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"pDlKu0JrXEAt"},"source":["**Discuss your results 1p**"]},{"cell_type":"markdown","metadata":{"id":"UFim2sjfXF_H"},"source":["It mostly performed with the accuracy around 70%.\n","When trained with more data it performed a little better on average.\n"]},{"cell_type":"markdown","metadata":{"id":"kDyquaYOl0Sa"},"source":["## Classification accuracy using leave-one-out cross validation"]},{"cell_type":"markdown","metadata":{"id":"_88lR86Kl0Sa"},"source":["Again, predict the **ship type** using **speed, destination, transformed length, and breadth** of the ship as features. Find an estimation for the classification accuracy using *leave-one-out cross validation (LOO CV)*. <br>\n","\n"," - Use leave-one-out cross validation to estimate the model performance **1p**\n","     - Use kNN classifier with k=3\n","     - What is the classification accuracy? Compare the result with the one you got in the previous task\n"," - Which method gives better evaluation of the performance of the classifier with this data set? 
Explain your choice **1p**"]},{"cell_type":"markdown","metadata":{"id":"C9WP3MZOx8fe"},"source":["**Use kNN classifier with k=3**"]},{"cell_type":"code","metadata":{"id":"Qs_hya_7c8w0"},"source":["from sklearn.model_selection import LeaveOneOut\n","\n","loo = LeaveOneOut()\n","\n","neigh = KNeighborsClassifier(n_neighbors=3)\n","\n","accuracy = np.zeros(loo.get_n_splits(X))\n","\n","for train_index, test_index in loo.split(X):\n","  X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]\n","  y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n","\n","  neigh.fit(X_train, np.ravel(y_train))\n","  \n","  accuracy[test_index] = neigh.score(X_test, np.ravel(y_test))"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"c3HONsWQyBHz"},"source":["**What is the classification accuracy? Compare the result with the one you got in the previous task**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"xEqI6C22yGV9","executionInfo":{"status":"ok","timestamp":1607102779029,"user_tz":-120,"elapsed":1161,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"d3c277da-2898-42e2-ec24-ebfc4c4241b4"},"source":["print(np.mean(accuracy))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["0.753731343283582\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"hh64ObxKioi-"},"source":["The accuracy is better with this cross-validation."]},{"cell_type":"markdown","metadata":{"id":"OEN34i4wyWGD"},"source":["**Which method gives better evaluation of the performance of the classifier with this data set? 
Explain your choice**"]},{"cell_type":"markdown","metadata":{"id":"CJLnbpmKym8t"},"source":["This gives better evaluation because we have more training data, and every datapoint gets to be part of the test."]},{"cell_type":"markdown","metadata":{"id":"BYt8MfHHl0Sa"},"source":["## Model selection with leave-one-out cross validation"]},{"cell_type":"markdown","metadata":{"id":"lQ5coDj8l0Sa"},"source":["- Select the best model (kNN with selection of k) using leave-one-out cross validation **2p**\n","    - Repeat the model performance estimation with values k=1...30\n","    - Which value of k produces the best classification accuracy?\n","    - If the number of k is still increased, what is the limit that the classification accuracy approaches? Why?\n","- Can you say something about the performance of this *selected* model with new, unseen data? Explain, how you could you estimate the performance of this selected model. **1p**"]},{"cell_type":"markdown","metadata":{"id":"u9GSR9Pbywet"},"source":["**Repeat the model performance estimation with values k=1...30**"]},{"cell_type":"code","metadata":{"id":"cNNzbRXIj5XC"},"source":["loo = LeaveOneOut()\n","\n","accuracy = np.zeros(loo.get_n_splits(X))\n","\n","means = np.zeros(30)\n","\n","for i in range(30):\n","\n","  neigh = KNeighborsClassifier(n_neighbors=i+1)\n","\n","  for train_index, test_index in loo.split(X):\n","    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]\n","    y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n","\n","    neigh.fit(X_train, np.ravel(y_train))\n","    \n","    accuracy[test_index] = neigh.score(X_test, np.ravel(y_test))\n","\n","  means[i]=np.mean(accuracy)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":282},"id":"2qtS_ThqmnrA","executionInfo":{"status":"ok","timestamp":1607101292900,"user_tz":-120,"elapsed":41189,"user":{"displayName":"Elias 
Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"175f2223-8bbc-42ed-a94f-8bb2d20f7d14"},"source":["plt.plot(np.array(range(30))+1, means)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[<matplotlib.lines.Line2D at 0x7feab4bcc2e8>]"]},"metadata":{"tags":[]},"execution_count":32},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"W3IOtXWFl0ML"},"source":["**Which value of k produces the best classification accuracy?**"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"t9-kdB1il3n8","executionInfo":{"status":"ok","timestamp":1607101292901,"user_tz":-120,"elapsed":41182,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"40ccf92b-74d7-4553-eac5-b0062dac3d64"},"source":["print(np.argmax(means)+1)\n","print(np.max(means))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["4\n","0.7686567164179104\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"nm2QJEZzpjLR"},"source":["k = 4 with accuracy of 77%"]},{"cell_type":"markdown","metadata":{"id":"wdM68_-2nNZf"},"source":["**If the number of k is still increased, what is the limit that the classification accuracy approaches? Why?**"]},{"cell_type":"markdown","metadata":{"id":"XeHC-r_RnPIK"},"source":["It approaches the ratio of the most abundant value in the data. 
Because when k = n, it is comparing a new datapoint with all of the training data so it always classifies it as the most abundant one."]},{"cell_type":"markdown","metadata":{"id":"eEsqYbZMlo3l"},"source":["**Can you say something about the performance of this selected model with new, unseen data? Explain, how you could you estimate the performance of this selected model. 1p**"]},{"cell_type":"markdown","metadata":{"id":"OJWbq69OoLgG"},"source":["Leave-one-out is a great way to approximate the performance of a model with new data. However, k = 4 was selected because it maximised this same leave-one-out accuracy, so the 77% is an optimistically biased estimate for the selected model. An unbiased estimate would require data that was not used in the selection, e.g. a separate hold-out test set or nested cross-validation (an inner LOO loop to choose k and an outer loop to evaluate the chosen model). So this model can be expected to guess new data correctly approximately 77% of the time, or slightly less."]},{"cell_type":"markdown","metadata":{"id":"xAWiKYYtl0Sa"},"source":["## Testing with training data (this should not be used!)"]},{"cell_type":"markdown","metadata":{"id":"-0wZYT8_l0Sa"},"source":["- Repeat the previous task but use the whole data for training **2p**\n","    - Plot the resulting classification accuracy versus k=1...30. Include the values from the previous task in the same figure\n","    - Comment your result. Why shouldn't you test with training data?"]},{"cell_type":"markdown","metadata":{"id":"SbJMb59yzLOD"},"source":["**Plot the resulting classification accuracy versus k=1...30. 
Include the values from the previous task in the same figure**"]},{"cell_type":"code","metadata":{"id":"0s_kAEE3l0Sa"},"source":["loo = LeaveOneOut()\n","\n","accuracy = np.zeros(loo.get_n_splits(X))\n","\n","means = np.zeros(30)\n","\n","for i in range(30):\n","\n","  neigh = KNeighborsClassifier(n_neighbors=i+1)\n","\n","  for train_index, test_index in loo.split(X):\n","    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]\n","    y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n","\n","    neigh.fit(X, np.ravel(y))  # Lets use whole data to fit\n","    \n","    accuracy[test_index] = neigh.score(X_test, np.ravel(y_test))\n","\n","  means[i]=np.mean(accuracy)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":282},"id":"PBIWLBMFqZuG","executionInfo":{"status":"ok","timestamp":1607101307634,"user_tz":-120,"elapsed":55901,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"3ace75de-4174-4c0a-e5e9-b87249f51eeb"},"source":["plt.plot(np.array(range(30))+1, means)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[<matplotlib.lines.Line2D at 0x7feab4b3e240>]"]},"metadata":{"tags":[]},"execution_count":35},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"OiCG6rViqnhs"},"source":["**Comment your result. Why shouldn't you test with training data?**"]},{"cell_type":"markdown","metadata":{"id":"fRHI31UDqp_2"},"source":["One might interpret this as k = 1 being the best. But because the test data is also in the training data, one of the closest neighbours is always itself. That is what makes it biased. And thats why k=1 it gives 100% accuracy. 
"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":732},"id":"ehg2UssczQkQ","executionInfo":{"status":"ok","timestamp":1607103352794,"user_tz":-120,"elapsed":13690,"user":{"displayName":"Elias Ervelä","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhcVQbqAobpSX3NE6w5d6aZPU_VzlnsvBC9GkyMtw=s64","userId":"11858975235946053692"}},"outputId":"058939ed-2979-4842-e906-2e601a88eb45"},"source":["!wget -nc https://raw.githubusercontent.com/brpy/colab-pdf/master/colab_pdf.py\n","from colab_pdf import colab_pdf\n","colab_pdf('DAKD2020_ex3_Elias_Ervela.ipynb', notebookpath = '/content/drive/My Drive/Colab Notebooks/DAKD/')"],"execution_count":null,"outputs":[{"output_type":"stream","text":["File ‘colab_pdf.py’ already there; not retrieving.\n","\n","Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease\n","Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease\n","Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease\n","Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release\n","Hit:5 http://security.ubuntu.com/ubuntu bionic-security InRelease\n","Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease\n","Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release\n","Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n","Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease\n","Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease\n","Hit:11 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n","Reading package lists... Done\n","Building dependency tree       \n","Reading state information... Done\n","68 packages can be upgraded. Run 'apt list --upgradable' to see them.\n","Reading package lists... Done\n","Building dependency tree       \n","Reading state information... 
Done\n","texlive-fonts-recommended is already the newest version (2017.20180305-1).\n","texlive-generic-recommended is already the newest version (2017.20180305-1).\n","texlive-xetex is already the newest version (2017.20180305-1).\n","0 upgraded, 0 newly installed, 0 to remove and 68 not upgraded.\n","[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/DAKD/DAKD2020_ex3_Elias_Ervela.ipynb to pdf\n","[NbConvertApp] Support files will be in DAKD2020_ex3_Elias_Ervela_files/\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Making directory ./DAKD2020_ex3_Elias_Ervela_files\n","[NbConvertApp] Writing 108970 bytes to ./notebook.tex\n","[NbConvertApp] Building PDF\n","[NbConvertApp] Running xelatex 3 times: [u'xelatex', u'./notebook.tex', '-quiet']\n","[NbConvertApp] Running bibtex 1 time: [u'bibtex', u'./notebook']\n","[NbConvertApp] WARNING | bibtex had problems, most likely because there were no citations\n","[NbConvertApp] PDF successfully created\n","[NbConvertApp] Writing 150344 bytes to /content/drive/My Drive/DAKD2020_ex3_Elias_Ervela.pdf\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/javascript":["\n","    async function download(id, filename, size) {\n","      if (!google.colab.kernel.accessAllowed) {\n","        return;\n","      }\n","      const div = document.createElement('div');\n","      const label = document.createElement('label');\n","      label.textContent = `Downloading \"${filename}\": `;\n","      div.appendChild(label);\n","      const progress = 
document.createElement('progress');\n","      progress.max = size;\n","      div.appendChild(progress);\n","      document.body.appendChild(div);\n","\n","      const buffers = [];\n","      let downloaded = 0;\n","\n","      const channel = await google.colab.kernel.comms.open(id);\n","      // Send a message to notify the kernel that we're ready.\n","      channel.send({})\n","\n","      for await (const message of channel.messages) {\n","        // Send a message to notify the kernel that we're ready.\n","        channel.send({})\n","        if (message.buffers) {\n","          for (const buffer of message.buffers) {\n","            buffers.push(buffer);\n","            downloaded += buffer.byteLength;\n","            progress.value = downloaded;\n","          }\n","        }\n","      }\n","      const blob = new Blob(buffers, {type: 'application/binary'});\n","      const a = document.createElement('a');\n","      a.href = window.URL.createObjectURL(blob);\n","      a.download = filename;\n","      div.appendChild(a);\n","      a.click();\n","      div.remove();\n","    }\n","  "],"text/plain":["<IPython.core.display.Javascript object>"]},"metadata":{"tags":[]}},{"output_type":"display_data","data":{"application/javascript":["download(\"download_44a7d4d4-3052-4e39-8884-f5c6e17f4abd\", \"DAKD2020_ex3_Elias_Ervela.pdf\", 150344)"],"text/plain":["<IPython.core.display.Javascript object>"]},"metadata":{"tags":[]}},{"output_type":"execute_result","data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'File ready to be Downloaded and Saved to Drive'"]},"metadata":{"tags":[]},"execution_count":46}]}]}
\ No newline at end of file