@@ -0,0 +1,10 @@
|
||||
nan -9.480000000000000426e+00 1.415000000000000036e+01 1.126999999999999957e+01 -5.650000000000000355e+00 3.330000000000000071e+00 1.094999999999999929e+01 -2.149999999999999911e+00 5.339999999999999858e+00 -2.830000000000000071e+00 |
||||
9.480000000000000426e+00 nan 4.860000000000000320e+00 -8.609999999999999432e+00 7.820000000000000284e+00 -1.128999999999999915e+01 1.324000000000000021e+01 4.919999999999999929e+00 2.859999999999999876e+00 9.039999999999999147e+00 |
||||
-1.415000000000000036e+01 -1.126999999999999957e+01 nan 1.227999999999999936e+01 -2.410000000000000142e+00 6.040000000000000036e+00 -5.160000000000000142e+00 -3.870000000000000107e+00 -1.281000000000000050e+01 1.790000000000000036e+00 |
||||
5.650000000000000355e+00 -3.330000000000000071e+00 -1.094999999999999929e+01 nan -1.364000000000000057e+01 0.000000000000000000e+00 2.240000000000000213e+00 -3.609999999999999876e+00 -7.730000000000000426e+00 8.000000000000000167e-02 |
||||
2.149999999999999911e+00 -5.339999999999999858e+00 2.830000000000000071e+00 -4.860000000000000320e+00 nan -8.800000000000000044e-01 -8.570000000000000284e+00 2.560000000000000053e+00 -7.030000000000000249e+00 -6.330000000000000071e+00 |
||||
8.609999999999999432e+00 -7.820000000000000284e+00 1.128999999999999915e+01 -1.324000000000000021e+01 -4.919999999999999929e+00 nan -1.296000000000000085e+01 -1.282000000000000028e+01 -1.403999999999999915e+01 1.456000000000000050e+01 |
||||
-2.859999999999999876e+00 -9.039999999999999147e+00 -1.227999999999999936e+01 2.410000000000000142e+00 -6.040000000000000036e+00 5.160000000000000142e+00 nan -1.091000000000000014e+01 -1.443999999999999950e+01 -1.372000000000000064e+01 |
||||
3.870000000000000107e+00 1.281000000000000050e+01 -1.790000000000000036e+00 1.364000000000000057e+01 -0.000000000000000000e+00 -2.240000000000000213e+00 3.609999999999999876e+00 nan 1.053999999999999915e+01 -1.417999999999999972e+01 |
||||
7.730000000000000426e+00 -8.000000000000000167e-02 8.800000000000000044e-01 8.570000000000000284e+00 -2.560000000000000053e+00 7.030000000000000249e+00 6.330000000000000071e+00 1.296000000000000085e+01 nan -1.169999999999999929e+01 |
||||
1.282000000000000028e+01 1.403999999999999915e+01 -1.456000000000000050e+01 1.091000000000000014e+01 1.443999999999999950e+01 1.372000000000000064e+01 -1.053999999999999915e+01 1.417999999999999972e+01 1.169999999999999929e+01 nan |
@@ -0,0 +1 @@
|
||||
Empty file. The original is too big to be pushed to GitHub.
|
@@ -0,0 +1,152 @@
|
||||
sepal_length,sepal_width,petal_length,petal_width, flower |
||||
5.1,3.5,1.4,0.2,Iris-setosa |
||||
4.9,3.0,1.4,0.2,Iris-setosa |
||||
4.7,3.2,1.3,0.2,Iris-setosa |
||||
4.6,3.1,1.5,0.2,Iris-setosa |
||||
5.0,-3.6,-1.4,0.2,Iris-setosa |
||||
5.4,3.9,1.7,0.4,Iris-setosa |
||||
4.6,3.4,1.4,0.3,Iris-setosa |
||||
5.0,3.4,1.5,0.2,Iris-setosa |
||||
-4.4,2.9,1400,0.2,Iris-setosa |
||||
4.9,3.1,1.5,0.1,Iris-setosa |
||||
5.4,3.7,1.5,0.2,Iris-setosa |
||||
4.8,3.4,1.6,0.2,Iris-setosa |
||||
4.8,3.0,1.4,0.1,Iris-setosa |
||||
4.3,3.0,1.1,0.1,Iris-setosa |
||||
5.8,4.0,1.2,0.2,Iris-setosa |
||||
5.7,4.4,1500,0.4,Iris-setosa |
||||
5.4,3.9,1.3,0.4,Iris-setosa |
||||
5.1,3.5,1.4,0.3,Iris-setosa |
||||
5.7,3.8,1.7,0.3,Iris-setosa |
||||
5.1,3.8,1.5,0.3,Iris-setosa |
||||
5.4,3.4,-1.7,0.2,Iris-setosa |
||||
5.1,3.7,1.5,0.4,Iris-setosa |
||||
4.6,3.6,1.0,0.2,Iris-setosa |
||||
5.1,3.3,1.7,0.5,Iris-setosa |
||||
4.8,3.4,1.9,0.2,Iris-setosa |
||||
5.0,-3.0,1.6,0.2,Iris-setosa |
||||
5.0,3.4,1.6,0.4,Iris-setosa |
||||
5.2,3.5,1.5,0.2,Iris-setosa |
||||
5.2,3.4,1.4,0.2,Iris-setosa |
||||
4.7,3.2,1.6,0.2,Iris-setosa |
||||
4.8,3.1,1.6,0.2,Iris-setosa |
||||
5.4,3.4,1.5,0.4,Iris-setosa |
||||
5.2,4.1,1.5,0.1,Iris-setosa |
||||
5.5,4.2,1.4,0.2,Iris-setosa |
||||
4.9,3.1,1.5,0.1,Iris-setosa |
||||
5.0,3.2,1.2,0.2,Iris-setosa |
||||
5.5,3.5,1.3,0.2,Iris-setosa |
||||
4.9,,1.5,0.1,Iris-setosa |
||||
4.4,3.0,1.3,0.2,Iris-setosa |
||||
5.1,3.4,1.5,0.2,Iris-setosa |
||||
5.0,"3.5",1.3,0.3,Iris-setosa |
||||
4.5,2.3,1.3,0.3,Iris-setosa |
||||
4.4,3.2,1.3,0.2,Iris-setosa |
||||
5.0,3.5,1.6,0.6,Iris-setosa |
||||
5.1,3.8,1.9,0.4,Iris-setosa |
||||
4.8,3.0,1.4,0.3,Iris-setosa |
||||
5.1,3809,1.6,0.2,Iris-setosa |
||||
4.6,3.2,1.4,0.2,Iris-setosa |
||||
5.3,3.7,1.5,0.2,Iris-setosa |
||||
5.0,3.3,1.4,0.2,Iris-setosa |
||||
7.0,3.2,4.7,1.4,Iris-versicolor |
||||
6.4,3200,4.5,1.5,Iris-versicolor |
||||
6.9,3.1,4.9,1.5,Iris-versicolor |
||||
5.5,2.3,4.0,1.3,Iris-versicolor |
||||
6.5,2.8,4.6,1.5,Iris-versicolor |
||||
5.7,2.8,4.5,1.3,Iris-versicolor |
||||
6.3,3.3,4.7,1600,Iris-versicolor |
||||
4.9,2.4,3.3,1.0,Iris-versicolor |
||||
6.6,2.9,4.6,1.3,Iris-versicolor |
||||
5.2,2.7,3.9,,Iris-versicolor |
||||
5.0,2.0,3.5,1.0,Iris-versicolor |
||||
5.9,3.0,4.2,1.5,Iris-versicolor |
||||
6.0,2.2,4.0,1.0,Iris-versicolor |
||||
6.1,2.9,4.7,1.4,Iris-versicolor |
||||
5.6,2.9,3.6,1.3,Iris-versicolor |
||||
6.7,3.1,4.4,1.4,Iris-versicolor |
||||
5.6,3.0,4.5,1.5,Iris-versicolor |
||||
5.8,2.7,4.1,1.0,Iris-versicolor |
||||
6.2,2.2,4.5,1.5,Iris-versicolor |
||||
5.6,2.5,3.9,1.1,Iris-versicolor |
||||
5.9,3.2,4.8,1.8,Iris-versicolor |
||||
6.1,2.8,4.0,1.3,Iris-versicolor |
||||
6.3,2.5,4.9,1.5,Iris-versicolor |
||||
6.1,2.8,4.7,1.2,Iris-versicolor |
||||
6.4,2.9,4.3,1.3,Iris-versicolor |
||||
6.6,3.0,4.4,1.4,Iris-versicolor |
||||
6.8,2.8,4.8,1.4,Iris-versicolor |
||||
6.7,3.0,5.0,1.7,Iris-versicolor |
||||
6.0,2.9,4.5,1.5,Iris-versicolor |
||||
5.7,2.6,3.5,1.0,Iris-versicolor |
||||
5.5,2.4,3.8,1.1,Iris-versicolor |
||||
5.5,2.4,3.7,1.0,Iris-versicolor |
||||
5.8,2.7,3.9,1.2,Iris-versicolor |
||||
6.0,2.7,5.1,1.6,Iris-versicolor |
||||
5.4,3.0,4.5,1.5,Iris-versicolor |
||||
6.0,3.4,4.5,1.6,Iris-versicolor |
||||
6.7,3.1,4.7,1.5,Iris-versicolor |
||||
6.3,2.3,4.4,1.3,Iris-versicolor |
||||
5.6,3.0,4.1,1.3,Iris-versicolor |
||||
5.5,2.5,4.0,1.3,Iris-versicolor |
||||
5.5,2.6,4.4,1.2,Iris-versicolor |
||||
6.1,3.0,4.6,1.4,Iris-versicolor |
||||
5.8,2.6,4.0,1.2,Iris-versicolor |
||||
5.0,2.3,3.3,1.0,Iris-versicolor |
||||
5.6,2.7,4.2,1.3,Iris-versicolor |
||||
5.7,3.0,4.2,1.2,Iris-versicolor |
||||
5.7,2.9,4.2,1.3,Iris-versicolor |
||||
6.2,2.9,4.3,1.3,Iris-versicolor |
||||
5.1,2.5,3.0,1.1,Iris-versicolor |
||||
5.7,2.8,4.1,1.3,Iris-versicolor |
||||
6.3,3.3,6.0,2.5,Iris-virginica |
||||
5.8,2.7,5.1,1.9,Iris-virginica |
||||
7.1,3.0,5.9,2.1,Iris-virginica |
||||
6.3,2.9,5.6,1.8,Iris-virginica |
||||
6.5,3.0,5.8,2.2,Iris-virginica |
||||
7.6,3.0,6.6,2.1,Iris-virginica |
||||
4.9,2.5,4.5,1.7,Iris-virginica |
||||
7.3,2.9,6.3,1.8,Iris-virginica |
||||
6.7,2.5,5.8,1.8,Iris-virginica |
||||
7.2,3.6,6.1,2.5,Iris-virginica |
||||
6.5,3.2,5.1,2.0,Iris-virginica |
||||
6.4,2.7,5.3,1.9,Iris-virginica |
||||
6.8,3.0,5.5,2.1,Iris-virginica |
||||
5.7,2.5,5.0,2.0,Iris-virginica |
||||
5.8,2.8,5.1,2.4,Iris-virginica |
||||
6.4,3.2,5.3,2.3,Iris-virginica |
||||
6.5,3.0,5.5,1.8,Iris-virginica |
||||
7.7,3.8,6.7,2.2,Iris-virginica |
||||
7.7,2.6,6.9,2.3,Iris-virginica |
||||
6.0,2.2,5.0,1.5,Iris-virginica |
||||
6.9,3.2,5.7,2.3,Iris-virginica |
||||
5.6,2.8,4.9,2.0,Iris-virginica |
||||
7.7,2.8,6.7,2.0,Iris-virginica |
||||
6.3,2.7,4.9,1.8,Iris-virginica |
||||
6.7,3.3,5.7,2.1,Iris-virginica |
||||
7.2,3.2,6.0,1.8,Iris-virginica |
||||
6.2,2.8,-4.8,1.8,Iris-virginica |
||||
6.1,3.0,4.9,1.8,Iris-virginica |
||||
6.4,2.8,5.6,2.1,Iris-virginica |
||||
7.2,3.0,5.8,1.6,Iris-virginica |
||||
7.4,2.8,6.1,1.9,Iris-virginica |
||||
7.9,3.8,6.4,2.0,Iris-virginica |
||||
6.-4,2.8,5.6,2.2,Iris-virginica |
||||
6.3,2.8,"5.1",1.5,Iris-virginica |
||||
6.1,2.6,5.6,1.4,Iris-virginica |
||||
7.7,3.0,6.1,2.3,Iris-virginica |
||||
6.3,3.4,5.6,2.4,Iris-virginica |
||||
6.4,3.1,5.5,1.8,Iris-virginica |
||||
6.0,3.0,4.8,1.8,Iris-virginica |
||||
6900,3.1,5.4,2.1,Iris-virginica |
||||
6.7,3.1,5.6,2.4,Iris-virginica |
||||
6.9,3.1,5.1,2.3,Iris-virginica |
||||
580,2.7,5.1,1.9,Iris-virginica |
||||
6.8,3.2,5.9,2.3,Iris-virginica |
||||
6.7,3.3,5.7,-2.5,Iris-virginica |
||||
6.7,3.0,5.2,2.3,Iris-virginica |
||||
6.3,2.5,5.0,1.9,Iris-virginica |
||||
6.5,3.0,5.2,2.0,Iris-virginica |
||||
6.2,3.4,5.4,2.3,Iris-virginica |
||||
5.9,3.0,5.1,1.8,Iris-virginica |
||||
|
@@ -0,0 +1,415 @@
|
||||
# D01 Piscine AI - Data Science |
||||
|
||||
The goal of this day is to understand practical usage of **NumPy**. **NumPy** is a commonly used Python data analysis package. By using **NumPy**, you can speed up your workflow, and interface with other packages in the Python ecosystem, like scikit-learn, that use **NumPy** under the hood. **NumPy** was originally developed in the mid 2000s, and arose from an even older package called Numeric. This longevity means that almost every data analysis or machine learning package for Python leverages **NumPy** in some way. |
||||
|
||||
Version of NumPy I used to do the exercises: 1.18.1

I suggest using the most recent version.

Author:
||||
|
||||
<div style="page-break-after: always"></div> |
||||
|
||||
# Outline: |
||||
A. Introduction |
||||
|
||||
B. Rules |
||||
C. Exercises
||||
|
||||
1. |
||||
2. |
||||
3. |
||||
4. |
||||
5. |
||||
6. |
||||
7. |
||||
8. |
||||
9. |
||||
|
||||
## Rules |
||||
... Google Colab notebooks or Jupyter Notebook

Save one notebook per day or one per exercise. Use markdown to divide your notebook into the different exercises.
||||
## Resources
||||
|
||||
- https://medium.com/fintechexplained/why-should-we-use-NumPy-c14a4fb03ee9 |
||||
- https://docs.scipy.org/doc/NumPy-1.15.0/reference/ |
||||
- https://jakevdp.github.io/PythonDataScienceHandbook/ |
||||
|
||||
# Exercise 1: Your first NumPy array
||||
|
||||
The goal of this exercise is to use many Python data types in **NumPy** arrays. **NumPy** arrays are intensively used in **NumPy** and **Pandas**. They are flexible and allow the use of optimized underlying **NumPy** functions.
||||
|
||||
1. Create a NumPy array that contains: an integer, a float, a string, a dictionary, a list, a tuple, a set and a boolean. |
||||
|
||||
The expected output is: |
||||
|
||||
``` |
||||
for i in your_np_array: |
||||
print(type(i)) |
||||
|
||||
<class 'int'> |
||||
<class 'float'> |
||||
<class 'str'> |
||||
<class 'dict'> |
||||
<class 'list'> |
||||
<class 'tuple'> |
||||
<class 'set'> |
||||
<class 'bool'> |
||||
|
||||
``` |
||||
## Correction |
||||
|
||||
1. This question is validated if `your_numpy_array` is a NumPy array. This can be checked with `type(your_numpy_array)`, which should equal `numpy.ndarray`, and if the types of its elements are as follows:
||||
|
||||
``` |
||||
for i in your_np_array: |
||||
print(type(i)) |
||||
|
||||
<class 'int'> |
||||
<class 'float'> |
||||
<class 'str'> |
||||
<class 'dict'> |
||||
<class 'list'> |
||||
<class 'tuple'> |
||||
<class 'set'> |
||||
<class 'bool'> |
||||
|
||||
``` |
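A minimal sketch that satisfies these checks (assuming any mix of the required types is acceptable; `dtype=object` lets the array hold heterogeneous Python objects):

```
import numpy as np

your_np_array = np.array(
    [1, 2.0, "three", {"four": 4}, [5], (6,), {7}, True],
    dtype=object,
)

for i in your_np_array:
    print(type(i))
```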
||||
--- |
||||
# Exercise 2: Zeros
||||
|
||||
The goal of this exercise is to learn to create a NumPy array filled with zeros.
||||
|
||||
1. Create a NumPy array of dimension **300** with zeros without filling it manually |
||||
2. Reshape it to **(3,100)** |
||||
|
||||
## Correction |
||||
|
||||
1. The question is validated if the solution uses `np.zeros` and if the shape of the array is `(300,)`
||||
|
||||
2. The question is validated if the solution uses `reshape` and the shape of the array is **(3, 100)** |
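A sketch of both steps:

```
import numpy as np

zeros = np.zeros(300)             # 1. a 1D array of zeros, shape (300,)
reshaped = zeros.reshape(3, 100)  # 2. shape (3, 100)
```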
||||
--- |
||||
## Exercise 3: Slicing
||||
|
||||
The goal of this exercise is to learn NumPy indexing/slicing. It allows you to access values of a NumPy array efficiently and without a for loop.
||||
|
||||
1. Create a NumPy array of dimension 1 that contains all integers from 1 to 100 ordered. |
||||
2. Without using a for loop and using the array created in Q1, create an array that contains all odd integers. The expected output is: `np.array([1,3,...,99])`. *Hint*: it takes one line
3. Without using a for loop and using the array created in Q1, create an array that contains all even integers reversed. The expected output is: `np.array([100,98,...,2])`. *Hint*: it takes one line

4. Using the array of Q1, set every third element (starting with the second) to 0. The expected output is: `np.array([1,0,3,4,0,...,0,99,100])`
||||
|
||||
## Correction |
||||
|
||||
|
||||
1. This question is validated if the solution doesn't involve a for loop or writing all integers from 1 to 100, and if the array is: `np.array([1,...,100])`. The list from 1 to 100 can be generated with an iterator: `range`.
||||
|
||||
2. This question is validated if the solution is: `integers[1::2]` |
||||
|
||||
3. This question is validated if the solution is: `integers[::-2]` |
||||
|
||||
4. This question is validated if the array is: `np.array([1,0,3,4,0,...,0,99,100])`. There are at least two ways to get this result without a for loop. The first one uses `integers[1::3] = 0` and the second involves creating a boolean array that indexes the array:
||||
|
||||
``` |
||||
mask = (integers+1)%3 == 0 |
||||
integers[mask] = 0 |
||||
``` |
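Putting the four questions together (a sketch):

```
import numpy as np

integers = np.arange(1, 101)     # 1. all integers from 1 to 100
odds = integers[::2]             # 2. [1, 3, ..., 99]
evens_reversed = integers[::-2]  # 3. [100, 98, ..., 2]
integers[1::3] = 0               # 4. every third element, starting with the second
```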
||||
--- |
||||
# Exercise 4: Random
||||
|
||||
The goal of this exercise is to learn to generate random data.
In Data Science it is extremely useful to generate random data for many reasons:
lack of real data, creating a random benchmark, using varied data sets.
NumPy proposes a lot of options to generate random data. In statistics, assumptions are made on the distribution the data comes from. All data distributions that can be generated randomly are described in the documentation. In this exercise we will focus on two distributions:
||||
|
||||
- Uniform: for example, if your goal is to generate a random number from 1 to 100 such that every number has the same probability, you'll need the uniform distribution. NumPy provides `randint` and `uniform` to sample from uniform distributions
- Normal: the normal distribution is the most important probability distribution in statistics because it fits many natural phenomena. For example, if you need to generate a data sample that represents **Heights of 14 Year Old Girls**, it can be done using the normal distribution. In that case, we need two parameters: the mean (1.51 m) and the standard deviation (0.0741 m). NumPy provides `randn` to generate normal distributions (among others)
||||
https://docs.scipy.org/doc/NumPy-1.15.0/reference/routines.random.html |
||||
|
||||
1. Set the seed to 888 |
||||
2. Generate a **one-dimensional** array of size 100 with a normal distribution |
||||
3. Generate a **two-dimensional** array of size 8,8 with random integers from 1 to 10 - both included (same probability for each integer) |
||||
4. Generate a **three-dimensional** of size 4,2,5 array with random integers from 1 to 17 - both included (same probability for each integer) |
||||
|
||||
|
||||
## Correction: |
||||
For this exercise, as the results may change depending on the version of the package or the OS, I give the code that corrects the exercise. If the code is correct and the output is not the same as mine, it is accepted.
||||
|
||||
1. The solution is accepted if the solution is: `np.random.seed(888)` |
||||
|
||||
2. The solution is accepted if the solution is `np.random.randn(100)`. The value of the first element is `0.17620087373662233`. |
||||
|
||||
3. The solution is accepted if the solution is `np.random.randint(1,11,(8,8))`. |
||||
|
||||
Given the NumPy version and the seed, you should have this output:

```
||||
|
||||
array([[ 7, 4, 8, 10, 2, 1, 1, 10], |
||||
[ 4, 1, 7, 4, 3, 5, 2, 8], |
||||
[ 3, 9, 7, 4, 9, 6, 10, 5], |
||||
[ 7, 10, 3, 10, 2, 1, 3, 7], |
||||
[ 3, 2, 3, 2, 10, 9, 5, 4], |
||||
[ 4, 1, 9, 7, 1, 4, 3, 5], |
||||
[ 3, 2, 10, 8, 6, 3, 9, 4], |
||||
[ 4, 4, 9, 2, 8, 5, 9, 5]]) |
||||
``` |
||||
4. The solution is accepted if the solution is `np.random.randint(1,18,(4,2,5))`. |
||||
|
||||
Given the NumPy version and the seed, you should have this output:

```
||||
|
||||
array([[[14, 16, 8, 15, 14], |
||||
[17, 13, 1, 4, 17]], |
||||
|
||||
[[ 7, 15, 2, 8, 3], |
||||
[ 9, 4, 13, 9, 15]], |
||||
|
||||
[[ 5, 11, 11, 14, 10], |
||||
[ 2, 1, 15, 3, 3]], |
||||
|
||||
[[ 3, 10, 5, 16, 13], |
||||
[17, 12, 9, 7, 16]]]) |
||||
``` |
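The four answers combined (a sketch; the exact numbers depend on the NumPy version):

```
import numpy as np

np.random.seed(888)                               # 1.
normal_sample = np.random.randn(100)              # 2. 100 draws from N(0, 1)
uniform_2d = np.random.randint(1, 11, (8, 8))     # 3. integers from 1 to 10, both included
uniform_3d = np.random.randint(1, 18, (4, 2, 5))  # 4. integers from 1 to 17, both included
```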
||||
--- |
||||
# Exercise 5: Split, concatenate, reshape arrays
||||
|
||||
The goal of this exercise is to learn to concatenate and reshape arrays.
||||
|
||||
1. Generate an array with integers from 1 to 50: `array([1,...,50])` |
||||
2. Generate an array with integers from 51 to 100: `array([51,...,100])` |
||||
|
||||
3. Using `np.concatenate`, concatenate the two arrays into: `array([1,...,100])` |
||||
4. Reshape the previous array into: |
||||
|
||||
``` |
||||
array([[ 1, ... , 10], |
||||
... |
||||
[ 91, ... , 100]]) |
||||
``` |
||||
|
||||
## Correction: |
||||
|
||||
1. This question is validated if the generated array is based on an iterator such as `range` or `np.arange`. Check that 50 is part of the array.

2. This question is validated if the generated array is based on an iterator such as `range` or `np.arange`. Check that 100 is part of the array.
||||
|
||||
3. This question is validated if you concatenated this way: `np.concatenate((array1, array2))`. Note that `np.concatenate` takes a sequence of arrays as its first argument.
||||
|
||||
4. This question is validated if the result is: |
||||
|
||||
``` |
||||
array([[ 1, ... , 10], |
||||
... |
||||
[ 91, ... , 100]]) |
||||
``` |
||||
The easiest way is to use `array.reshape(10,10)`. |
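A sketch of the four steps:

```
import numpy as np

first = np.arange(1, 51)                    # array([1, ..., 50])
second = np.arange(51, 101)                 # array([51, ..., 100])
combined = np.concatenate((first, second))  # array([1, ..., 100])
grid = combined.reshape(10, 10)             # rows [1..10] down to [91..100]
```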
||||
|
||||
https://jakevdp.github.io/PythonDataScienceHandbook/02.02-the-basics-of-NumPy-arrays.html |
||||
|
||||
--- |
||||
# Exercise 6: Broadcasting and Slicing
||||
|
||||
The goal of this exercise is to learn to access values of n-dimensional arrays efficiently.
||||
|
||||
1. Create a 2-dimensional array of size 9x9 filled with 1s. Each value has to be an `int8`.
||||
2. Using **slicing**, output this array: |
||||
|
||||
``` |
||||
array([[1, 1, 1, 1, 1, 1, 1, 1, 1], |
||||
[1, 0, 0, 0, 0, 0, 0, 0, 1], |
||||
[1, 0, 1, 1, 1, 1, 1, 0, 1], |
||||
[1, 0, 1, 0, 0, 0, 1, 0, 1], |
||||
[1, 0, 1, 0, 1, 0, 1, 0, 1], |
||||
[1, 0, 1, 0, 0, 0, 1, 0, 1], |
||||
[1, 0, 1, 1, 1, 1, 1, 0, 1], |
||||
[1, 0, 0, 0, 0, 0, 0, 0, 1], |
||||
[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int8) |
||||
``` |
||||
|
||||
https://jakevdp.github.io/PythonDataScienceHandbook/02.05-computation-on-arrays-broadcasting.html |
||||
|
||||
## Correction |
||||
|
||||
1. The question is validated if the output is the same as: |
||||
`np.ones([9,9], dtype=np.int8)` |
||||
|
||||
2. The question is validated if the output is
||||
|
||||
``` |
||||
array([[1, 1, 1, 1, 1, 1, 1, 1, 1], |
||||
[1, 0, 0, 0, 0, 0, 0, 0, 1], |
||||
[1, 0, 1, 1, 1, 1, 1, 0, 1], |
||||
[1, 0, 1, 0, 0, 0, 1, 0, 1], |
||||
[1, 0, 1, 0, 1, 0, 1, 0, 1], |
||||
[1, 0, 1, 0, 0, 0, 1, 0, 1], |
||||
[1, 0, 1, 1, 1, 1, 1, 0, 1], |
||||
[1, 0, 0, 0, 0, 0, 0, 0, 1], |
||||
[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int8) |
||||
``` |
||||
The solution is not accepted if the values of the array have been changed one by one manually. Using a for loop is not allowed either.
Here is an example of a solution:
||||
|
||||
``` |
||||
import numpy as np

x = np.ones((9, 9), dtype=np.int8)  # question 1
x[1:8, 1:8] = 0                     # first inner ring set to 0
x[2:7, 2:7] = 1                     # next ring back to 1
x[3:6, 3:6] = 0                     # next ring set to 0
x[4, 4] = 1                         # center
||||
``` |
||||
--- |
||||
# Exercise 7: NaN
||||
|
||||
The goal of this exercise is to learn to deal with missing data in NumPy and to manipulate NumPy arrays.
||||
|
||||
Let us consider a 2-dimensional array that contains the grades from the past two exams. Some of the students missed the first exam; as those grades are missing, they have been replaced with NaN.
||||
|
||||
1. Using `np.where`, create a third column that is equal to the grade of the first exam if it exists and to the grade of the second exam otherwise. Add it as the third column of the array.
||||
|
||||
**Using a for loop or if/else statement is not allowed in this exercice.** |
||||
|
||||
``` |
||||
import numpy as np |
||||
|
||||
generator = np.random.default_rng(123) |
||||
grades = np.round(generator.uniform(low = 0.0, high = 10.0, size = (10, 2))) |
||||
grades[[1,2,5,7], [0,0,0,0]] = np.nan |
||||
print(grades) |
||||
|
||||
``` |
||||
|
||||
## Correction |
||||
|
||||
1. There are two steps in this exercise:
- Create the vector that contains the grade of the first exam if available, or the grade of the second exam otherwise. This can be done using `np.where`:
||||
``` |
||||
np.where(np.isnan(grades[:, 0]), grades[:, 1], grades[:, 0]) |
||||
``` |
||||
- Add this vector as the third column of the array. Here are two ways:
||||
``` |
||||
np.insert(arr = grades, values = new_vector, axis = 1, obj = 2) |
||||
|
||||
np.hstack((grades, new_vector[:, None])) |
||||
``` |
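Putting the two steps together (a sketch; `new_vector` names the intermediate result):

```
new_vector = np.where(np.isnan(grades[:, 0]), grades[:, 1], grades[:, 0])
grades = np.hstack((grades, new_vector[:, None]))
```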
||||
This question is validated if, without having used a for loop or having filled the array manually, the output is: |
||||
|
||||
``` |
||||
[[ 7. 1. 7.] |
||||
[nan 2. 2.] |
||||
[nan 8. 8.] |
||||
[ 9. 3. 9.] |
||||
[ 8. 9. 8.] |
||||
[nan 2. 2.] |
||||
[ 8. 2. 8.] |
||||
[nan 6. 6.] |
||||
[ 9. 2. 9.] |
||||
[ 8. 5. 8.]] |
||||
``` |
||||
|
||||
https://jakevdp.github.io/PythonDataScienceHandbook/02.02-the-basics-of-NumPy-arrays.html |
||||
|
||||
--- |
||||
# Exercise 8: Wine
||||
|
||||
The goal of this exercise is to learn to perform a basic data analysis on real data using NumPy.

The data set that will be used for this exercise is the wine data set.
||||
https://archive.ics.uci.edu/ml/datasets/wine+quality |
||||
|
||||
|
||||
How to tell if a given 2D array has null columns? |
||||
|
||||
1. Using `genfromtxt`, load the data and reduce the size of the NumPy array by optimizing the types. The sum of absolute differences between the original data set and the memory-optimized one has to be smaller than 1e-3. I suggest using `np.float32`. Check that the NumPy array weighs **76800 bytes**.
||||
|
||||
2. Print 2nd, 7th and 12th rows as a two dimensional array |
||||
3. Is there any wine with a percentage of alcohol greater than 20%? Return True or False
||||
|
||||
4. What is the average % of alcohol over all wines in the data set? If needed, drop `np.nan` values
5. Compute the minimum, the maximum, the 25th percentile, the 75th percentile, and the median of the pH
6. Compute the average quality of the wines having the 20% lowest sulphates
7. Compute the mean of all variables for the wines having the best quality. Same question for the wines having the worst quality
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the text file has successfully been loaded into a NumPy array with
`genfromtxt('winequality-red.csv', delimiter=',')` and the reduced array weighs **76800 bytes**
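A minimal loading sketch (assuming the comma delimiter above; casting to `np.float32` halves the memory footprint):

```
import numpy as np

data = np.genfromtxt('winequality-red.csv', delimiter=',')
data_f32 = data.astype(np.float32)
assert np.nansum(np.abs(data - data_f32)) < 1e-3  # the precision lost stays below 1e-3
print(data_f32.nbytes)  # expected: 76800
```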
||||
|
||||
2. This question is validated if the output is |
||||
|
||||
|
||||
``` |
||||
array([[ 7.4 , 0.7 , 0. , 1.9 , 0.076 , 11. , 34. , |
||||
0.9978, 3.51 , 0.56 , 9.4 , 5. ], |
||||
[ 7.4 , 0.66 , 0. , 1.8 , 0.075 , 13. , 40. , |
||||
0.9978, 3.51 , 0.56 , 9.4 , 5. ], |
||||
[ 6.7 , 0.58 , 0.08 , 1.8 , 0.097 , 15. , 65. , |
||||
0.9959, 3.28 , 0.54 , 9.2 , 5. ]]) |
||||
``` |
||||
This slicing gives the answer `my_data[[1,6,11],:]`. |
||||
|
||||
3. This question is validated if the answer is False. There are many ways to get the answer: find the maximum, or check whether any value is greater than 20.
||||
|
||||
4. This question is validated if the answer is 10.422983114446529. |
||||
|
||||
5. This question is validated if the answer is:
||||
|
||||
``` |
||||
pH stats |
||||
25 percentile: 3.21 |
||||
50 percentile: 3.31 |
||||
75 percentile: 3.4 |
||||
mean: 3.3111131957473416 |
||||
min: 2.74 |
||||
max: 4.01 |
||||
``` |
||||
*Note: using `percentile` or `median` may give different results depending on the duplicate values in the column. If you do not get my results, please use `percentile`.*
||||
6. This question is validated if the answer is `5.222222222222222`. The first step is to get the 20th percentile of the column `sulphates`, then create a boolean array that contains `True` where the value is smaller than that percentile, then select those rows in the column `quality` and compute the `mean`.
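A sketch of those steps, assuming `sulphates` is column index 9 and `quality` column index 11 (the layout of the wine data set):

```
threshold = np.nanpercentile(data_f32[:, 9], 20)  # 20th percentile of sulphates
mask = data_f32[:, 9] < threshold
print(np.nanmean(data_f32[mask, 11]))             # average quality of those wines
```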
||||
|
||||
7. This question is validated if the output for the best wines is: |
||||
|
||||
``` |
||||
array([ 8.56666667, 0.42333333, 0.39111111, 2.57777778, 0.06844444, |
||||
13.27777778, 33.44444444, 0.99521222, 3.26722222, 0.76777778, |
||||
12.09444444, 8. ]) |
||||
``` |
||||
|
||||
And the output for the bad wines is: |
||||
|
||||
|
||||
``` |
||||
array([ 8.36 , 0.8845 , 0.171 , 2.635 , 0.1225 , 11. , |
||||
24.9 , 0.997464, 3.398 , 0.57 , 9.955 , 3. ]) |
||||
``` |
||||
|
||||
This can be done in three steps: get the max, create a boolean mask that indicates the rows with max quality, then use this mask to subset the rows with the best quality and compute the mean along axis 0.
||||
|
||||
--- |
||||
## Exercise 9: Football tournament
||||
|
||||
The goal of this exercise is to learn to use permutations and to solve a more complex problem.
||||
|
||||
A football tournament is organized in your city. There are 10 teams and the director of the tournament wants you to create a first round as exciting as possible. To do so, you are allowed to choose the pairs. As a former data scientist, you implemented a model based on the teams' current season performance. This model predicts the score difference between two teams. You used this algorithm to predict the score difference for every possible pair.
The matrix returned is a 2-dimensional array that contains in (i,j) the predicted score difference between team i and team j. The matrix is in `model_forecasts.txt`.
||||
|
||||
Using this output, what are the pairs that will give the most interesting matches?
||||
|
||||
If a team wins 7-1, the match is obviously less exciting than a match where the winner wins 2-1.
The criterion that corresponds to **the pairs that will give the most interesting matches** is **the pairs that minimize the sum of squared differences**.
||||
|
||||
The expected output is: |
||||
|
||||
``` |
||||
[[m1_t1 m2_t1 m3_t1 m4_t1 m5_t1] |
||||
[m1_t2 m2_t2 m3_t2 m4_t2 m5_t2]] |
||||
|
||||
``` |
||||
- m1_t1 stands for match1_team1 |
||||
- m1_t1 plays against m1_t2 ... |
||||
|
||||
|
||||
**Usage of a for loop is not allowed; you may need to use the library** `itertools` **to create permutations**
||||
|
||||
## Correction |
||||
|
||||
This exercise is validated if the output is:
||||
|
||||
``` |
||||
[[0 3 1 2 4] |
||||
[7 6 8 9 5]] |
||||
``` |
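A sketch of one approach that matches this output's structure: pair teams 0-4 against permutations of teams 5-9 and keep the permutation that minimizes the sum of squared predicted differences (assuming the matrix loads with `np.genfromtxt`; the ordering of the printed matches may differ):

```
import itertools
import numpy as np

model = np.genfromtxt('model_forecasts.txt')  # 10x10 score-difference matrix
perms = np.array(list(itertools.permutations(range(5, 10))))
# cost of each candidate pairing: sum over the 5 matches of the squared difference
costs = (model[np.arange(5), perms] ** 2).sum(axis=1)
best = perms[costs.argmin()]
print(np.stack([np.arange(5), best]))
```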
@@ -0,0 +1,373 @@
|
||||
# D02 Piscine AI - Data Science |
||||
|
||||
|
||||
|
||||
Author: |
||||
|
||||
# Table of Contents: |
||||
Historical part: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
The goal of this day is to understand practical usage of Pandas. |
||||
As Pandas is intensively used in Data Science, other days of the piscine will be dedicated to it.
||||
|
||||
Not only is the Pandas library a central component of the data science toolkit, but it is also used in conjunction with other libraries in that collection.
||||
|
||||
Pandas is built on top of the NumPy package, meaning a lot of the structure of NumPy is used or replicated in Pandas. Data in pandas is often used to feed statistical analysis in SciPy, plotting functions from Matplotlib, and machine learning algorithms in Scikit-learn. |
||||
|
||||
Most of the topics we will cover today are explained and described with examples in the first resource. The number of exercises is low on purpose: take the time to understand chapter 5 of the resource, even if it is 40 pages long.
||||
|
||||
The version of Pandas I used is '1.0.1'. |
||||
|
||||
## Rules |
||||
... |
||||
## Resources

- If I had to give you one resource, it would be this one:
||||
|
||||
https://bedford-computing.co.uk/learning/wp-content/uploads/2015/10/Python-for-Data-Analysis.pdf |
||||
It contains ALL you need to know about Pandas. |
||||
|
||||
- Pandas documentation: |
||||
|
||||
https://pandas.pydata.org/docs/ |
||||
|
||||
|
||||
- https://jakevdp.github.io/PythonDataScienceHandbook/ |
||||
|
||||
https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf |
||||
https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/ |
||||
|
||||
https://jakevdp.github.io/PythonDataScienceHandbook/03.04-missing-values.html |
||||
# Exercise 1
||||
|
||||
The goal of this exercise is to learn to create basic Pandas objects.
||||
|
||||
1. Create the DataFrame below in two ways:
||||
- From a NumPy array |
||||
- From a Pandas Series |
||||
|
||||
| | color | list | number | |
||||
|---:|:--------|:--------|---------:| |
||||
| 1 | Blue | [1, 2] | 1.1 | |
||||
| 3 | Red | [3, 4] | 2.2 | |
||||
| 5 | Pink | [5, 6] | 3.3 | |
||||
| 7 | Grey | [7, 8] | 4.4 | |
||||
| 9 | Black | [9, 10] | 5.5 | |
||||
|
||||
|
||||
2. Print the type of every column and the type of the first value of every column
||||
|
||||
## Solution |
||||
|
||||
1. The solution is accepted if the DataFrame created is the same as the "model" DataFrame. Check that the index is 1, 3, 5, 7, 9 and not the default 0, 1, 2, 3, 4.
||||
|
||||
2. The solution is accepted if the types you get for the columns are |
||||
|
||||
``` |
||||
<class 'pandas.core.series.Series'> |
||||
<class 'pandas.core.series.Series'> |
||||
<class 'pandas.core.series.Series'> |
||||
|
||||
``` |
||||
|
||||
and if the types of the first value of the columns are |
||||
|
||||
``` |
||||
<class 'str'> |
||||
<class 'list'> |
||||
<class 'float'> |
||||
|
||||
``` |
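A sketch of both constructions (the index and `dtype=object` keep the mixed types intact):

```
import numpy as np
import pandas as pd

index = [1, 3, 5, 7, 9]

# From a NumPy array: dtype=object is required to mix strings, lists and floats
arr = np.array(
    [['Blue', [1, 2], 1.1],
     ['Red', [3, 4], 2.2],
     ['Pink', [5, 6], 3.3],
     ['Grey', [7, 8], 4.4],
     ['Black', [9, 10], 5.5]],
    dtype=object,
)
df = pd.DataFrame(arr, columns=['color', 'list', 'number'], index=index)

# From Pandas Series
df = pd.DataFrame({
    'color': pd.Series(['Blue', 'Red', 'Pink', 'Grey', 'Black'], index=index),
    'list': pd.Series([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], index=index),
    'number': pd.Series([1.1, 2.2, 3.3, 4.4, 5.5], index=index),
})

# Question 2: type of every column and type of its first value
for col in df.columns:
    print(type(df[col]), type(df[col].iloc[0]))
```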
||||
|
||||
# Exercise 2: **Electric power consumption**
||||
|
||||
The goal of this exercise is to learn to manipulate real data with Pandas.
||||
The data set used is **Individual household electric power consumption** |
||||
|
||||
1. Delete the columns `Time`, `Sub_metering_2` and `Sub_metering_3` |
||||
|
||||
2. Set `Date` as index |
||||
|
||||
|
||||
3. Create a function that takes as input the DataFrame with the data set and returns a DataFrame with updated types: |
||||
|
||||
|
||||
``` |
||||
def update_types(df): |
||||
#TODO |
||||
return df |
||||
``` |
||||
|
||||
4. Use `describe` to have an overview on the data set |
||||
5. Delete the rows with missing values |
||||
6. Modify `Sub_metering_1` by multiplying it by 0.06
||||
|
||||
|
||||
7. Select all the rows for which the Date is greater than 2008-12-27 and `Voltage` is greater than 242 |
||||
|
||||
8. Print the 88888th row. |
||||
9. What is the date for which the `Global_active_power` is maximal?
||||
10. Sort the first three columns by descending order of `Global_active_power` and ascending order of `Voltage`. |
||||
|
||||
11. Compute the daily average of `Global_active_power`. |
||||
|
||||
|
||||
|
||||
## Correction: |
||||
|
||||
1. `del` works but it is not a solution I recommend; for this exercise it is accepted. It is expected to use `drop` with `axis=1`. `inplace=True` may be useful to avoid assigning the result to a variable.
||||
|
||||
2. The preferred solution is `set_index` with `inplace=True`. As long as the DataFrame returns the output below, the solution is accepted. If the type of the index is not `dtype='datetime64[ns]'`, the solution is not accepted.
||||
|
||||
|
||||
``` |
||||
Input: df.head().index |
||||
|
||||
Output: |
||||
|
||||
DatetimeIndex(['2006-12-16', '2006-12-16','2006-12-16', '2006-12-16','2006-12-16'], |
||||
dtype='datetime64[ns]', name='Date', freq=None) |
||||
|
||||
``` |
||||
|
||||
3. The preferred solution is `pd.to_numeric` with `errors='coerce'`. The solution is accepted if all types are `float64`.
||||
|
||||
``` |
||||
Input: df.dtypes |
||||
|
||||
Output: |
||||
|
||||
Global_active_power float64 |
||||
Global_reactive_power float64 |
||||
Voltage float64 |
||||
Global_intensity float64 |
||||
Sub_metering_1 float64 |
||||
dtype: object |
||||
|
||||
``` |
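A sketch of `update_types`, assuming all remaining columns should become numeric:

```
import pandas as pd

def update_types(df):
    # coerce every column to numeric; unparseable values become NaN
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
```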
||||
|
||||
4. `df.describe()` is expected |
||||
|
||||
5. You should have noticed that 25979 rows contain missing values (129895 missing values in total). `df.isna().sum()` allows you to check the number of missing values per column, and `df.dropna()` with `inplace=True` drops them. The solution is accepted if you used `dropna` and found the same number of missing values.
||||
|
||||
6. Two solutions are accepted:
- `df.loc[:,'A'] = df['A'] * 0.06`
- Using `apply`: `df.loc[:,'A'] = df['A'].apply(lambda x: x * 0.06)`

You may wonder why `df.loc[:,'A']` is required and whether `df['A'] = ...` works too. The answer is no. This is important in Pandas. Depending on the version of Pandas, it may return a warning. The reason is that you may be assigning a value to a **copy** of the DataFrame and not to the DataFrame itself.
||||
More details: https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas |
||||
|
||||
7. The solution is accepted as long as the output of `print(filtered_df.head().to_markdown())` is |
||||
|
||||
|
||||
| Date | Global_active_power | Global_reactive_power | |
||||
|:--------------------|----------------------:|------------------------:| |
||||
| 2008-12-27 00:00:00 | 0.996 | 0.066 | |
||||
| 2008-12-27 00:00:00 | 1.076 | 0.162 | |
||||
| 2008-12-27 00:00:00 | 1.064 | 0.172 | |
||||
| 2008-12-27 00:00:00 | 1.07 | 0.174 | |
||||
| 2008-12-27 00:00:00 | 0.804 | 0.184 | |
||||
|
||||
Check that the number of rows is equal to **449667**. |
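A sketch of the filter; the boundary date appears to be included, as the expected output above suggests:

```
filtered_df = df[(df.index >= '2008-12-27') & (df['Voltage'] > 242)]
print(len(filtered_df))  # expected: 449667
```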
||||
|
||||
8. The solution is accepted if the output is
||||
|
||||
``` |
||||
Global_active_power 0.254 |
||||
Global_reactive_power 0.000 |
||||
Voltage 238.350 |
||||
Global_intensity 1.200 |
||||
Sub_metering_1 0.000 |
||||
Name: 2007-02-16 00:00:00, dtype: float64 |
||||
|
||||
``` |
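One way to print it, assuming "88888th" is 1-indexed (so positional index 88887):

```
print(df.iloc[88887])
```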
||||
|
||||
9. The solution is accepted if the output is `Timestamp('2009-02-22 00:00:00')` |
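`idxmax` gives this directly:

```
df['Global_active_power'].idxmax()  # Timestamp('2009-02-22 00:00:00')
```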
||||
|
||||
10. The solution is accepted if the output of `print(sorted_df.tail().to_markdown())` is
||||
|
||||
| Date | Global_active_power | Global_reactive_power | Voltage | |
||||
|:--------------------|----------------------:|------------------------:|----------:| |
||||
| 2008-08-28 00:00:00 | 0.076 | 0 | 234.88 | |
||||
| 2008-08-28 00:00:00 | 0.076 | 0 | 235.18 | |
||||
| 2008-08-28 00:00:00 | 0.076 | 0 | 235.4 | |
||||
| 2008-08-28 00:00:00 | 0.076 | 0 | 235.64 | |
||||
| 2008-12-08 00:00:00 | 0.076 | 0 | 236.5 | |
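A sketch of the sort:

```
sorted_df = df[['Global_active_power', 'Global_reactive_power', 'Voltage']].sort_values(
    by=['Global_active_power', 'Voltage'], ascending=[False, True])
print(sorted_df.tail().to_markdown())
```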
||||
|
||||
|
||||
11. The solution is based on `groupby`, which creates groups based on the index `Date`, and aggregates each group using the `mean`. The solution is accepted if the output is
||||
|
||||
``` |
||||
Date |
||||
2006-12-16 3.053475 |
||||
2006-12-17 2.354486 |
||||
2006-12-18 1.530435 |
||||
2006-12-19 1.157079 |
||||
2006-12-20 1.545658 |
||||
... |
||||
2010-12-07 0.770538 |
||||
2010-12-08 0.367846 |
||||
2010-12-09 1.119508 |
||||
2010-12-10 1.097008 |
||||
2010-12-11 1.275571 |
||||
Name: Global_active_power, Length: 1433, dtype: float64 |
||||
``` |
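A sketch:

```
daily_avg = df.groupby(df.index)['Global_active_power'].mean()
```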
||||
|
||||
|
||||
|
||||
|
||||
# Exercise 3: E-commerce purchases
||||
|
||||
The goal of this exercise is to learn to manipulate real data with Pandas. This exercise is less guided since exercise 2 should have given you a nice introduction.
||||
|
||||
The data set used is **E-commerce purchases**. |
||||
|
||||
Questions: |
||||
1. How many rows and columns are there? |
||||
2. What is the average Purchase Price? |
||||
3. What were the highest and lowest purchase prices? |
||||
4. How many people have English `'en'` as their Language of choice on the website? |
||||
5. How many people have the job title of `"Lawyer"`?
6. How many people made the purchase during the `AM` and how many made the purchase during the `PM`?
||||
7. What are the 5 most common Job Titles? |
||||
8. Someone made a purchase that came from Lot: `"90 WT"` , what was the Purchase Price for this transaction? |
||||
9. What is the email of the person with the following Credit Card Number: `4926535242672853` |
||||
10. How many people have American Express as their Credit Card Provider and made a purchase above `$95`?
||||
11. How many people have a credit card that expires in `2025`? |
||||
12. What are the top 5 most popular email providers/hosts (e.g. gmail.com, yahoo.com, etc...) |
||||
|
||||
## Correction |
||||
To validate this exercise, all answers should return the expected numerical value given in the correction AND use Pandas. For example, using NumPy to compute the mean doesn't respect the philosophy of the exercise, which is to use Pandas.
||||
|
||||
1. How many rows and columns are there? **10000 entries**
||||
|
||||
There are many solutions, based on `shape`, `info`, or `describe`
||||
|
||||
2. What is the average Purchase Price? **50.34730200000025** |
||||
|
||||
Even if `np.mean` gives the solution, `df['Purchase Price'].mean()` is preferred |
||||
|
||||
3. What were the highest and lowest purchase prices? |
||||
|
||||
min: 0 |
||||
|
||||
max: 99.99
||||
|
||||
4. How many people have English `'en'` as their Language of choice on the website? **1098** |
||||
|
||||
5. How many people have the job title of `"Lawyer"`? **30**
||||
|
||||
6. How many people made the purchase during the `AM` and how many made the purchase during the `PM`?
||||
|
||||
PM: 5068 |
||||
|
||||
AM: 4932 |
||||
|
||||
There are many ways to get the solution, but the goal of this question was to make you use `value_counts`
||||
|
||||
7. What are the 5 most common Job Titles? |
||||
|
||||
```
Interior and spatial designer    31
Lawyer                           30
Social researcher                28
Purchasing manager               27
Designer, jewellery              27
```
||||
|
||||
There are many ways to get the solution, but the goal of this question was to make you use `value_counts`
||||
|
||||
8. Someone made a purchase that came from Lot: `"90 WT"` , what was the Purchase Price for this transaction? **75.1** |
||||
9. What is the email of the person with the following Credit Card Number: `4926535242672853`. **bondellen@williams-garza.com** |
||||
10. How many people have American Express as their Credit Card Provider and made a purchase above `$95`? **39**
||||
|
||||
The preferred solution is based on this pattern:
||||
|
||||
`df[(df['A'] == X) & (df['B'] > Y)]` |
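Concretely (the column names `CC Provider` and `Purchase Price` are assumptions based on the data set):

```
len(df[(df['CC Provider'] == 'American Express') & (df['Purchase Price'] > 95)])
```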
||||
11. How many people have a credit card that expires in `2025`? **1033** |
||||
|
||||
The preferred solution is based on the usage of `apply` with a `lambda` function that slices the string containing the expiration date.
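For example, assuming the column is named `CC Exp Date` and stores dates in an `MM/YY` format:

```
df['CC Exp Date'].apply(lambda s: s.split('/')[1] == '25').sum()
```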
||||
|
||||
12. What are the top 5 most popular email providers/hosts (e.g. gmail.com, yahoo.com, etc...) |
||||
|
||||
- hotmail.com 1638 |
||||
- yahoo.com 1616 |
||||
- gmail.com 1605 |
||||
- smith.com 42 |
||||
- williams.com 37 |
||||
|
||||
The preferred solution is based on the usage of `apply` with a `lambda` function that slices the string containing the email. The `lambda` function uses `split` to split the string on `@`. Finally, `value_counts` is used to count the occurrences.
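A sketch, assuming the column is named `Email`:

```
df['Email'].apply(lambda s: s.split('@')[1]).value_counts().head(5)
```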
||||
|
||||
# Exercise 4: Handling missing values
||||
|
||||
The goal of this exercise is to learn to handle missing values. In the previous exercise we used the first technique: filtering out the missing values. We were lucky because the proportion of missing values was low. But in some cases, dropping the missing values is not possible because the filtered data set would be too small.
||||
This article explains the different types of missing data and how they should be handled. https://towardsdatascience.com/data-cleaning-with-python-and-pandas-detecting-missing-values-3e9c6ebcf78b |
||||

"**It’s important to understand these different types of missing data from a statistics point of view. The type of missing data will influence how you deal with filling in the missing values.**"
||||
|
||||
1. Drop the `flower` column |
||||
- Fill the missing values with a different "strategy" for each column: |
||||
|
||||
`sepal_length` -> `mean` |
||||
|
||||
`sepal_width` -> `median` |
||||
|
||||
`petal_length`, `petal_width` -> `0` |
||||
|
||||
2. Explain why filling the missing values with 0 or the mean is a bad idea |
||||
3. Fill the missing values using the median |
||||
|
||||
|
||||
|
||||
## Correction |
||||
|
||||
To validate the exercise, you should have done these two steps in that order:
||||
- Convert the numerical columns to `float` |
||||
|
||||
``` |
||||
example: |
||||
pd.to_numeric(df.loc[:,col], errors='coerce') |
||||
``` |
||||
- Fill the missing values. There are many solutions for this step, here is one of them. |
||||
``` |
||||
example: |
||||
df.fillna({'sepal_length': df.sepal_length.mean(),
           'sepal_width': df.sepal_width.median(),
           'petal_length': 0,
           'petal_width': 0})
||||
``` |
||||
- It is important to understand why filling the missing values with 0 or the mean of the column is a bad idea. |
||||
|
||||
| | sepal_length | sepal_width | petal_length | petal_width | |
||||
|:------|---------------:|--------------:|---------------:|--------------:| |
||||
| count | 146 | 141 | 120 | 147 | |
||||
| mean | 56.9075 | 52.6255 | 15.5292 | 12.0265 | |
||||
| std | 572.222 | 417.127 | 127.46 | 131.873 | |
||||
| min | -4.4 | -3.6 | -4.8 | -2.5 | |
||||
| 25% | 5.1 | 2.8 | 2.725 | 0.3 | |
||||
| 50% | 5.75 | 3 | 4.5 | 1.3 | |
||||
| 75% | 6.4 | 3.3 | 5.1 | 1.8 | |
||||
| max | 6900 | 3809 | 1400 | 1600 | |
||||
|
||||
Once we have filled the missing values as suggested in the first question, `df.describe()` returns this interesting summary. We notice that the mean is way higher than the median, which suggests there are outliers in the data. The 75th percentile and the max confirm that: 75% of the flowers have a sepal length smaller than 6.4 cm, but the max is 6900 cm. If you check on the internet, you realize this small flower can't be that big. The outliers have a major impact on the mean, which equals 56.9. Filling the missing values with it is not correct since it doesn't correspond to the real size of this flower. That's why, in that case, the best strategy to fill the missing values was the median. The truth is that I modified the data set! But real data sets ALWAYS contain outliers.
||||
|
||||
Bonus: |
||||
- If you noticed the negative values and the huge values, you will be a good data scientist. **YOU SHOULD ALWAYS TRY TO UNDERSTAND YOUR DATA**. Print the row with index 122 ;-)
||||
|
||||
This week, we will have the opportunity to focus on data pre-processing to understand how outliers are handled.
||||
|
||||
|
||||
|
||||
EXos Ă ajouter: |
||||
|
||||
Créer une Series |
||||
train_test_split |
||||
Ajouter 3 exos sur les fontions natives incontournable de Pandas |
||||
|
||||
dropna |
@@ -0,0 +1,339 @@
|
||||
# D03 Piscine AI - Data Science |
||||
|
||||
|
||||
Author: |
||||
|
||||
|
||||
|
||||
# Introduction |
||||
|
||||
While working on a dataset it is important to check the distribution of the data. Obviously, for most humans it is difficult to visualize data in more than 3 dimensions.
||||
|
||||
Viz is important to understand the data and to show results. We have already seen that there are some basic viz functionalities in Pandas.
Now we'll discover some of the most well-known viz libraries in Python:
||||
- Pandas viz |
||||
- Matplotlib |
||||
- Plotly |
||||
|
||||
|
||||
Pandas viz is practical: it gives quick plots and relies on Matplotlib (check the Matplotlib doc; some parameters are not detailed in the Pandas doc).
For more elaborate plots, Matplotlib is necessary.

And finally, Plotly is an interactive plotting library.
||||
|
||||
## Rules |
||||
Always a title, legend, ... |
||||
|
||||
## Resources
||||
https://matplotlib.org/3.3.3/tutorials/index.html |
||||
https://towardsdatascience.com/matplotlib-tutorial-learn-basics-of-pythons-powerful-plotting-library-b5d1b8f67596 |
||||
|
||||
https://github.com/rougier/matplotlib-tutorial |
||||
https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html |
||||
|
||||
|
||||
|
||||
# Exercise 1: Pandas plot 1
||||
|
||||
The goal of this exercise is to learn to create plots with Pandas. Pandas' `.plot()` is a wrapper for `matplotlib.pyplot.plot()`.
||||
|
||||
Here is the data we will be using: |
||||
|
||||
``` |
||||
df = pd.DataFrame({ |
||||
'name':['christopher','marion','maria','mia','clement','randy','remi'], |
||||
'age':[70,30,22,19,45,33,20], |
||||
'gender':['M','F','F','F','M','M','M'], |
||||
'state':['california','dc','california','dc','california','new york','porto'], |
||||
'num_children':[2,0,0,3,8,1,4], |
||||
'num_pets':[5,1,0,5,2,2,3] |
||||
}) |
||||
``` |
||||
1. Reproduce this plot. This plot is called a bar plot |
||||
|
||||
|
||||
![alt text][logo] |
||||
|
||||
[logo]: images/day03/w1day03_ex1_plot1.png "Bar plot ex1" |
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- name on x-axis |
||||
- legend |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot reproduces the plot in the image. It has to contain a title, an x-axis name and a legend. |
||||
![alt text][logo] |
||||
|
||||
[logo]: images/day03/w1day03_ex1_plot1.png "Bar plot ex1" |
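A sketch that typically produces this kind of bar plot (the plotted columns and the title are assumptions, not given by the exercise):

```
import matplotlib.pyplot as plt

ax = df.plot(kind='bar', x='name', y=['num_children', 'num_pets'],
             title='Number of children and pets per person')
ax.set_xlabel('name')
plt.show()
```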
||||
|
||||
|
||||
## Exercise 2: Pandas plot 2
||||
|
||||
The goal of this exercise is to learn to create plots with Pandas. Pandas' `.plot()` is a wrapper for `matplotlib.pyplot.plot()`.
||||
|
||||
|
||||
``` |
||||
df = pd.DataFrame({ |
||||
'name':['christopher','marion','maria','mia','clement','randy','remi'], |
||||
'age':[70,30,22,19,45,33,20], |
||||
'gender':['M','F','F','F','M','M','M'], |
||||
'state':['california','dc','california','dc','california','new york','porto'], |
||||
'num_children':[2,0,0,3,8,1,4], |
||||
'num_pets':[5,1,0,5,2,2,3] |
||||
}) |
||||
``` |
||||
|
||||
1. Reproduce this plot. This plot is called a scatter plot. Do you observe a relationship between the age and the number of children?
||||
|
||||
![alt text][logo_ex2] |
||||
|
||||
[logo_ex2]: images/day03/w1day03_ex2_plot1.png "Scatter plot ex2" |
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- name on x-axis |
||||
- name on y-axis |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot reproduces the plot in the image. It has to contain a title, an x-axis name and a y-axis name.
You should also observe that the older people are, the more children they tend to have.
||||
|
||||
![alt text][logo_ex2] |
||||
|
||||
[logo_ex2]: images/day03/w1day03_ex2_plot1.png "Scatter plot ex2" |
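A sketch (the axis choice is assumed from the plot):

```
df.plot.scatter(x='age', y='num_children', title='Age vs number of children')
```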
||||
|
||||
|
||||
|
||||
|
||||
## Exercise 3: Matplotlib 1
||||
|
||||
The goal of this exercise is to learn to use Matplotlib to plot data. As you know, Matplotlib is the underlying library used by Pandas. It provides more options to plot custom visualizations. However, most of the plots we will create with Matplotlib can be reproduced with Pandas' `.plot()`.
||||
|
||||
|
||||
1. Reproduce this plot. We assume the data points have integer coordinates.
||||
|
||||
![alt text][logo_ex3] |
||||
|
||||
[logo_ex3]: images/day03/w1day03_ex3_plot1.png "Scatter plot ex3" |
||||
|
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- name on x-axis and y-axis |
||||
- x-axis and y-axis are limited to [1,8] |
||||
- **style**: |
||||
|
||||
- red dashdot line with a width of 3 |
||||
- blue circles with a size of 12 |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot reproduces the plot in the image and respects these criteria
||||
|
||||
- the title |
||||
- name on x-axis and y-axis |
||||
- x-axis and y-axis are limited to [1,8] |
||||
- **style**: |
||||
|
||||
- red dashdot line with a width of 3 |
||||
- blue circles with a size of 12 |
||||
|
||||
![alt text][logo_ex3] |
||||
|
||||
[logo_ex3]: images/day03/w1day03_ex3_plot1.png "Scatter plot ex3" |
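A sketch matching the style constraints (the exact data points and the title are assumptions; any integer points along the line work):

```
import matplotlib.pyplot as plt

x = [1, 2, 3, 4, 5, 6, 7]
y = [2, 3, 4, 5, 6, 7, 8]  # assumed integer coordinates

plt.plot(x, y, linestyle='dashdot', color='red', linewidth=3,
         marker='o', markerfacecolor='blue', markeredgecolor='blue',
         markersize=12)
plt.title('Example plot')
plt.xlabel('x')
plt.ylabel('y')
plt.xlim(1, 8)
plt.ylim(1, 8)
plt.show()
```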
||||
|
||||
# Exercise 4: Matplotlib 2
The goal of this exercise is to learn to use Matplotlib to plot different lines in the same plot on different axes using `twinx`. This is very useful to compare variables with different ranges.
||||
|
||||
Here is the data: |
||||
|
||||
``` |
||||
left_data = [5, 7, 11, 13, 17] |
||||
right_data = [0.1, 0.2, 0.4, 0.8, -1.6] |
||||
x_axis = [0.0, 1.0, 2.0, 3.0, 4.0] |
||||
``` |
||||
1. Reproduce this plot |
||||
![alt text][logo_ex4] |
||||
|
||||
[logo_ex4]: images/day03/w1day03_ex4_plot1.png "Twin axis plot ex4" |
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- name on left y-axis and right y-axis |
||||
- **style**: |
||||
|
||||
- left data in black |
||||
- right data in red |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot reproduces the plot in the image and respects these criteria
||||
|
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- name on left y-axis and right y-axis |
||||
- **style**: |
||||
|
||||
- left data in black |
||||
- right data in red |
||||
|
||||
![alt text][logo_ex4] |
||||
|
||||
[logo_ex4]: images/day03/w1day03_ex4_plot1.png "Twin axis ex4" |
||||
|
||||
https://matplotlib.org/gallery/api/two_scales.html |
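A sketch using `twinx` (the title and axis names are assumptions):

```
import matplotlib.pyplot as plt

left_data = [5, 7, 11, 13, 17]
right_data = [0.1, 0.2, 0.4, 0.8, -1.6]
x_axis = [0.0, 1.0, 2.0, 3.0, 4.0]

fig, ax_left = plt.subplots()
ax_right = ax_left.twinx()  # second y-axis sharing the same x-axis
ax_left.plot(x_axis, left_data, color='black')
ax_right.plot(x_axis, right_data, color='red')
ax_left.set_ylabel('left data', color='black')
ax_right.set_ylabel('right data', color='red')
ax_left.set_title('Left and right data on twin axes')
plt.show()
```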
||||
|
||||
# Exercise 5: Matplotlib subplots
The goal of this exercise is to learn to use Matplotlib to create subplots.
||||
|
||||
1. Reproduce this plot using a **for loop**: |
||||
|
||||
![alt text][logo_ex5] |
||||
|
||||
[logo_ex5]: images/day03/w1day03_ex5_plot1.png "Subplots ex5" |
||||
|
||||
The plot has to contain: |
||||
|
||||
- 6 subplots: 2 rows, 3 columns |
||||
- Keep space between plots: `hspace=0.5` and `wspace=0.5` |
||||
- Each plot contains |
||||
|
||||
- Text (2,3,i) centered at 0.5, 0.5. *Hint*: check the parameter `ha` of `text` |
||||
- a title: Title i |
||||
|
||||
## Correction |
||||
|
||||
1. The question is validated if the plot reproduces the image and the given criteria:
||||
|
||||
The plot has to contain: |
||||
|
||||
- 6 subplots: 2 rows, 3 columns |
||||
- Keep space between plots: `hspace=0.5` and `wspace=0.5` |
||||
- Each plot contains |
||||
|
||||
- Text (2,3,i) centered at 0.5, 0.5. *Hint*: check the parameter `ha` of `text` |
||||
- a title: Title i |
||||
|
||||
![alt text][logo_ex5] |
||||
|
||||
[logo_ex5]: images/day03/w1day03_ex5_plot1.png "Subplots ex5" |
||||
|
||||
Check that the plot has been created with a for loop. |
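A sketch with the required for loop:

```
import matplotlib.pyplot as plt

fig = plt.figure()
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for i in range(1, 7):
    ax = fig.add_subplot(2, 3, i)  # 2 rows, 3 columns, i-th subplot
    ax.text(0.5, 0.5, f'(2,3,{i})', ha='center')
    ax.set_title(f'Title {i}')
plt.show()
```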
||||
|
||||
# Exercise 6: Plotly 1
||||
Plotly has evolved a lot in the previous years. It is important to **always check the documentation**. |
||||
|
||||
Plotly comes with a high-level interface: Plotly Express. It helps build some complex plots easily. The lesson won't detail the complex examples. Plotly Express is quite interesting when using Pandas DataFrames because some of its built-in functions leverage DataFrames directly.
||||
|
||||
The plot output by Plotly is interactive and can also be dynamic.
||||
|
||||
The goal of the exercise is to plot the price of a company's stock. The price is generated below.
||||
|
||||
``` |
||||
import numpy as np
import pandas as pd

returns = np.random.randn(50)
price = 100 + np.cumsum(returns)
||||
|
||||
dates = pd.date_range(start='2020-09-01', periods=50, freq='B') |
||||
df = pd.DataFrame(zip(dates, price), |
||||
columns=['Date','Company_A']) |
||||
``` |
||||
|
||||
1. Using Plotly Express, reproduce the plot in the image. As the data is generated randomly, I do not expect you to reproduce the same line.
||||
|
||||
![alt text][logo_ex6] |
||||
|
||||
[logo_ex6]: images/day03/w1day03_ex6_plot1.png "Time series ex6" |
||||
|
||||
The plot has to contain: |
||||
|
||||
- title |
||||
- x-axis name |
||||
- yaxis name |
||||
|
||||
2. Same question but now using `plotly.graph_objects`. You may need to use `init_notebook_mode` from `plotly.offline`. |
||||
|
||||
https://plotly.com/python/time-series/
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot in the image is reproduced using Plotly Express, given these criteria:
||||
|
||||
The plot has to contain: |
||||
|
||||
- a title |
||||
- x-axis name |
||||
- yaxis name |
||||
![alt text][logo_ex6] |
||||
|
||||
[logo_ex6]: images/day03/w1day03_ex6_plot1.png "Time series ex6" |
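A sketch with Plotly Express (the title is an assumption):

```
import plotly.express as px

fig = px.line(df, x='Date', y='Company_A', title='Company A stock price')
fig.show()
```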
||||
|
||||
2. This question is validated if the plot in the image is reproduced using `plotly.graph_objects`, given these criteria:
||||
|
||||
The plot has to contain: |
||||
|
||||
- a title |
||||
- x-axis name |
||||
- yaxis name |
||||
|
||||
![alt text][logo_ex6] |
||||
|
||||
[logo_ex6]: images/day03/w1day03_ex6_plot1.png "Time series ex6" |
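A sketch with `plotly.graph_objects` (same assumed labels):

```
import plotly.graph_objects as go

fig = go.Figure(go.Scatter(x=df['Date'], y=df['Company_A'], mode='lines'))
fig.update_layout(title='Company A stock price',
                  xaxis_title='Date', yaxis_title='Company_A')
fig.show()
```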
||||
|
||||
# Exercise 7: Plotly Box plots
||||
|
||||
The goal of this exercise is to learn to use Plotly to plot box plots. A box plot is a method for graphically depicting groups of numerical data through their quartiles and values such as the min and max. It allows you to compare some variables quickly.
||||
|
||||
Let us generate 3 random arrays from a normal distribution, adding respectively 1 and 2 to the second and third arrays.
||||
|
||||
``` |
||||
y0 = np.random.randn(50) |
||||
y1 = np.random.randn(50) + 1 # shift mean |
||||
y2 = np.random.randn(50) + 2 |
||||
``` |
||||
1. Plot 2 box plots in the same figure, as shown in the image. In this exercise the style is not important.
||||
|
||||
![alt text][logo_ex7] |
||||
|
||||
[logo_ex7]: images/day03/w1day03_ex7_plot1.png "Box plot ex7" |
||||
|
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- the legend |
||||
https://plotly.com/python/box-plots/ |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot in the image is reproduced given these criteria:
||||
|
||||
The plot has to contain: |
||||
|
||||
- the title |
||||
- the legend |
||||
|
||||
![alt text][logo_ex7] |
||||
|
||||
[logo_ex7]: images/day03/w1day03_ex7_plot1.png "Box plot ex7" |
||||
|
||||
|
||||
``` |
||||
import plotly.graph_objects as go |
||||
import numpy as np |
||||
|
||||
y0 = np.random.randn(50) |
||||
y1 = np.random.randn(50) + 1 # shift mean |
||||
y2 = np.random.randn(50) + 2 |
||||
|
||||
fig = go.Figure() |
||||
fig.add_trace(go.Box(y=y0, name='Sample A', |
||||
marker_color = 'indianred')) |
||||
fig.add_trace(go.Box(y=y1, name = 'Sample B', |
||||
marker_color = 'lightseagreen')) |
||||
|
||||
fig.show() |
||||
``` |
@@ -0,0 +1,402 @@
|
||||
# D04 Piscine AI - Data Science |
||||
|
||||
|
||||
Author: |
||||
|
||||
# Table of Contents: |
||||
Historical part: |
||||
|
||||
Data wrangling, unifying sources of data ...
||||
# Introduction |
||||
|
||||
|
||||
|
||||
... |
||||
## Resources
||||
Pandas website |
||||
- https://jakevdp.github.io/PythonDataScienceHandbook/ |
||||
|
||||
https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf |
||||
|
||||
|
||||
https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/ |
||||
|
||||
https://towardsdatascience.com/different-ways-to-iterate-over-rows-in-a-pandas-dataframe-performance-comparison-dc0d5dcef8fe |
||||
|
||||
|
||||
|
||||
# Exercise 1: Concatenate
||||
|
||||
The goal of this exercise is to learn to concatenate DataFrames. The logic is the same for Series.
||||
|
||||
Here are the two DataFrames to concatenate: |
||||
|
||||
|
||||
``` |
||||
df1 = pd.DataFrame([['a', 1], ['b', 2]], |
||||
columns=['letter', 'number']) |
||||
df2 = pd.DataFrame([['c', 1], ['d', 2]], |
||||
columns=['letter', 'number']) |
||||
|
||||
``` |
||||
|
||||
1. Concatenate these two DataFrames along the index axis and reset the index. The index of the output should be `RangeIndex(start=0, stop=4, step=1)`. **Do not change the index manually**.
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the resulting DataFrame is:
||||
|
||||
| | letter | number | |
||||
|---:|:---------|---------:| |
||||
| 0 | a | 1 | |
||||
| 1 | b | 2 | |
||||
| 2 | c | 1 | |
||||
| 3 | d | 2 | |
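A sketch:

```
import pandas as pd

result = pd.concat([df1, df2]).reset_index(drop=True)
# equivalently: pd.concat([df1, df2], ignore_index=True)
```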
||||
|
||||
|
||||
# Exercise 2: Merge

The goal of this exercise is to learn to merge DataFrames.
The logic of merging DataFrames in Pandas is quite similar to the one used in SQL.
||||
|
||||
Here are the two DataFrames to merge: |
||||
|
||||
``` |
||||
#df1 |
||||
|
||||
df1_dict = { |
||||
'id': ['1', '2', '3', '4', '5'], |
||||
'Feature1': ['A', 'C', 'E', 'G', 'I'], |
||||
'Feature2': ['B', 'D', 'F', 'H', 'J']} |
||||
|
||||
df1 = pd.DataFrame(df1_dict, columns = ['id', 'Feature1', 'Feature2']) |
||||
|
||||
#df2 |
||||
df2_dict = { |
||||
'id': ['1', '2', '6', '7', '8'], |
||||
'Feature1': ['K', 'M', 'O', 'Q', 'S'], |
||||
'Feature2': ['L', 'N', 'P', 'R', 'T']} |
||||
|
||||
df2 = pd.DataFrame(df2_dict, columns = ['id', 'Feature1', 'Feature2']) |
||||
``` |
||||
1. Merge the two DataFrames to get this output: |
||||
|
||||
| | id | Feature1_x | Feature2_x | Feature1_y | Feature2_y | |
||||
|---:|-----:|:-------------|:-------------|:-------------|:-------------| |
||||
| 0 | 1 | A | B | K | L | |
||||
| 1 | 2 | C | D | M | N | |
||||
|
||||
2. Merge the two DataFrames to get this output: |
||||
|
||||
| | id | Feature1_df1 | Feature2_df1 | Feature1_df2 | Feature2_df2 | |
||||
|---:|-----:|:---------------|:---------------|:---------------|:---------------| |
||||
| 0 | 1 | A | B | K | L | |
||||
| 1 | 2 | C | D | M | N | |
||||
| 2 | 3 | E | F | nan | nan | |
||||
| 3 | 4 | G | H | nan | nan | |
||||
| 4 | 5 | I | J | nan | nan | |
||||
| 5 | 6 | nan | nan | O | P | |
||||
| 6 | 7 | nan | nan | Q | R | |
||||
| 7 | 8 | nan | nan | S | T | |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is: |
||||
|
||||
| | id | Feature1_x | Feature2_x | Feature1_y | Feature2_y | |
||||
|---:|-----:|:-------------|:-------------|:-------------|:-------------| |
||||
| 0 | 1 | A | B | K | L | |
||||
| 1 | 2 | C | D | M | N | |
||||
|
||||
2. This question is validated if the output is: |
||||
|
||||
| | id | Feature1_df1 | Feature2_df1 | Feature1_df2 | Feature2_df2 | |
||||
|---:|-----:|:---------------|:---------------|:---------------|:---------------| |
||||
| 0 | 1 | A | B | K | L | |
||||
| 1 | 2 | C | D | M | N | |
||||
| 2 | 3 | E | F | nan | nan | |
||||
| 3 | 4 | G | H | nan | nan | |
||||
| 4 | 5 | I | J | nan | nan | |
||||
| 5 | 6 | nan | nan | O | P | |
||||
| 6 | 7 | nan | nan | Q | R | |
||||
| 7 | 8 | nan | nan | S | T | |
||||
|
||||
Note: Check that the suffixes are set using the `suffixes` parameter rather than manually renaming the columns. One possible solution is sketched below.
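A minimal sketch of both merges (question 1 is an inner merge on `id`, question 2 an outer merge with custom suffixes):

```
import pandas as pd

df1 = pd.DataFrame({'id': ['1', '2', '3', '4', '5'],
                    'Feature1': ['A', 'C', 'E', 'G', 'I'],
                    'Feature2': ['B', 'D', 'F', 'H', 'J']})
df2 = pd.DataFrame({'id': ['1', '2', '6', '7', '8'],
                    'Feature1': ['K', 'M', 'O', 'Q', 'S'],
                    'Feature2': ['L', 'N', 'P', 'R', 'T']})

# 1. inner merge keeps only the ids present in both DataFrames
inner = df1.merge(df2, on='id', how='inner')

# 2. outer merge keeps all ids; suffixes name the overlapping columns
outer = df1.merge(df2, on='id', how='outer', suffixes=('_df1', '_df2'))
```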
||||
|
||||
|
||||
# Exercise 3 Merge MultiIndex

The goal of this exercise is to learn to merge DataFrames with a MultiIndex.
Use the code below to generate the DataFrames. `market_data` contains fake market data. In finance, the market is available during the trading days (business days). `alternative_data` contains fake alternative data from social media. This data is available every day. But, for some reason, the Data Engineer lost the last 15 days of alternative data.
||||
|
||||
1. Using `market_data` as the reference, merge `alternative_data` onto `market_data`.
||||
|
||||
```
import numpy as np
import pandas as pd

# generate days
all_dates = pd.date_range('2021-01-01', '2021-12-15')
business_dates = pd.bdate_range('2021-01-01', '2021-12-31')

# generate tickers
tickers = ['AAPL', 'FB', 'GE', 'AMZN', 'DAI']

# create indexes
index_alt = pd.MultiIndex.from_product([all_dates, tickers], names=['Date', 'Ticker'])
index = pd.MultiIndex.from_product([business_dates, tickers], names=['Date', 'Ticker'])

# create DataFrames
market_data = pd.DataFrame(index=index,
                           data=np.random.randn(len(index), 3),
                           columns=['Open', 'Close', 'Close_Adjusted'])

alternative_data = pd.DataFrame(index=index_alt,
                                data=np.random.randn(len(index_alt), 2),
                                columns=['Twitter', 'Reddit'])
```
||||
|
||||
`reset_index` is not allowed for this question |
||||
|
||||
2. Fill missing values with 0 |
||||
|
||||
https://medium.com/swlh/merging-dataframes-with-pandas-pd-merge-7764c7e2d46d |
||||
|
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the outputted DataFrame's shape is `(1305, 5)` and if `merged.head()` returns: |
||||
|
||||
| | Open | Close | Close_Adjusted | Twitter | Reddit | |
||||
|:-----------------------------------------------------|-----------:|----------:|-----------------:|------------:|----------:| |
||||
| (Timestamp('2021-01-01 00:00:00', freq='B'), 'AAPL') | 0.0991792 | -0.31603 | 0.634787 | -0.00159041 | 1.06053 | |
||||
| (Timestamp('2021-01-01 00:00:00', freq='B'), 'FB') | -0.123753 | 1.00269 | 0.713264 | 0.0142127 | -0.487028 | |
||||
| (Timestamp('2021-01-01 00:00:00', freq='B'), 'GE') | -1.37775 | -1.01504 | 1.2858 | 0.109835 | 0.04273 | |
||||
| (Timestamp('2021-01-01 00:00:00', freq='B'), 'AMZN') | 1.06324 | 0.841241 | -0.799481 | -0.805677 | 0.511769 | |
||||
| (Timestamp('2021-01-01 00:00:00', freq='B'), 'DAI') | -0.603453 | -2.06141 | -0.969064 | 1.49817 | 0.730055 | |
||||
|
||||
One of the answers that returns the correct DataFrame is: |
||||
|
||||
`market_data.merge(alternative_data, how='left', left_index=True, right_index=True)` |
||||
|
||||
2. This question is validated if the number of missing values in the DataFrame is equal to 0 and if `filled_df.sum().sum() == merged_df.sum().sum()` gives `True`.
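A minimal sketch, assuming `merged_df` is the DataFrame obtained in question 1:

```
# replace every NaN introduced by the left merge with 0; the double sum
# is unchanged because the filled values are 0
filled_df = merged_df.fillna(0)
print(filled_df.isna().sum().sum())  # 0
```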
||||
|
||||
|
||||
# Exercise 4 Groupby Apply

The goal of this exercise is to learn to group the data and apply a function on the groups.
The use case we will work on is computing winsorized values.

1. Create a function that uses `pandas.DataFrame.clip` and that replaces extreme values by a given percentile. The values that are greater than the upper percentile 80% are replaced by the percentile 80%. The values that are smaller than the lower percentile 20% are replaced by the percentile 20%. This process, which corrects outliers, is called **winsorizing**.
I recommend using NumPy to compute the percentiles, to make sure we use the same default parameters.
||||
|
||||
|
||||
``` |
||||
def winsorize(df, quantiles): |
||||
""" |
||||
df: pd.DataFrame |
||||
quantiles: list |
||||
ex: [0.05, 0.95] |
||||
""" |
||||
#TODO |
||||
return |
||||
``` |
||||
Here is what the function should output: |
||||
|
||||
``` |
||||
df = pd.DataFrame(range(1,11), columns=['sequence']) |
||||
print(winsorize(df, [0.20, 0.80]).to_markdown()) |
||||
|
||||
``` |
||||
|
||||
|
||||
| | sequence | |
||||
|---:|-----------:| |
||||
| 0 | 2.8 | |
||||
| 1 | 2.8 | |
||||
| 2 | 3 | |
||||
| 3 | 4 | |
||||
| 4 | 5 | |
||||
| 5 | 6 | |
||||
| 6 | 7 | |
||||
| 7 | 8 | |
||||
| 8 | 8.2 | |
||||
| 9 | 8.2 | |
||||
|
||||
|
||||
2. Now we consider that each value belongs to a group. The goal is to apply the **winsorizing to each group**. In this question we use common winsorizing quantiles: `[0.05, 0.95]`. Here is the new data set:
||||
|
||||
``` |
||||
groups = np.concatenate([np.ones(10), np.ones(10)+1, np.ones(10)+2, np.ones(10)+3, np.ones(10)+4]) |
||||
|
||||
df = pd.DataFrame(data= zip(groups, |
||||
range(1,51)), |
||||
columns=["group", "sequence"]) |
||||
``` |
||||
The expected output (first rows) is: |
||||
|
||||
| | sequence | |
||||
|---:|-----------:| |
||||
| 0 | 1.45 | |
||||
| 1 | 2 | |
||||
| 2 | 3 | |
||||
| 3 | 4 | |
||||
| 4 | 5 | |
||||
| 5 | 6 | |
||||
| 6 | 7 | |
||||
| 7 | 8 | |
||||
| 8 | 9 | |
||||
| 9 | 9.55 | |
||||
| 10 | 11.45 | |
||||
|
||||
|
||||
## Correction |
||||
The for loop is forbidden in this exercise. The goal is to use `groupby` and `apply`.
||||
|
||||
1. This question is validated if the output is: |
||||
|
||||
``` |
||||
df = pd.DataFrame(range(1,11), columns=['sequence']) |
||||
print(winsorize(df, [0.20, 0.80]).to_markdown()) |
||||
|
||||
``` |
||||
|
||||
|
||||
| | sequence | |
||||
|---:|-----------:| |
||||
| 0 | 2.8 | |
||||
| 1 | 2.8 | |
||||
| 2 | 3 | |
||||
| 3 | 4 | |
||||
| 4 | 5 | |
||||
| 5 | 6 | |
||||
| 6 | 7 | |
||||
| 7 | 8 | |
||||
| 8 | 8.2 | |
||||
| 9 | 8.2 | |
||||
|
||||
|
||||
2. This question is validated if the output is the same as the one returned by: |
||||
|
||||
``` |
||||
def winsorize(df_series, quantiles): |
||||
""" |
||||
df: pd.DataFrame or pd.Series |
||||
quantiles: list [0.05, 0.95] |
||||
|
||||
""" |
||||
min_value = np.quantile(df_series, quantiles[0]) |
||||
max_value = np.quantile(df_series, quantiles[1]) |
||||
|
||||
return df_series.clip(lower = min_value, upper = max_value) |
||||
|
||||
|
||||
df.groupby("group")[['sequence']].apply(winsorize, [0.05,0.95]) |
||||
``` |
||||
The output can also be a Series instead of a DataFrame.
||||
|
||||
The expected output (first rows) is: |
||||
|
||||
| | sequence | |
||||
|---:|-----------:| |
||||
| 0 | 1.45 | |
||||
| 1 | 2 | |
||||
| 2 | 3 | |
||||
| 3 | 4 | |
||||
| 4 | 5 | |
||||
| 5 | 6 | |
||||
| 6 | 7 | |
||||
| 7 | 8 | |
||||
| 8 | 9 | |
||||
| 9 | 9.55 | |
||||
| 10 | 11.45 | |
||||
|
||||
https://towardsdatascience.com/how-to-use-the-split-apply-combine-strategy-in-pandas-groupby-29e0eb44b62e |
||||
|
||||
|
||||
|
||||
# Exercise 5 Groupby Agg

The goal of this exercise is to learn to compute different types of aggregations on the groups. This small DataFrame contains products and prices.
||||
|
||||
| | value | product | |
||||
|---:|--------:|:-------------| |
||||
| 0 | 20.45 | table | |
||||
| 1 | 22.89 | chair | |
||||
| 2 | 32.12 | chair | |
||||
| 3 | 111.22 | mobile phone | |
||||
| 4 | 33.22 | table | |
||||
| 5 | 100 | mobile phone | |
||||
| 6 | 99.99 | table | |
||||
|
||||
1. Compute the min, max and mean price for each product in one single line of code. The expected output is: |
||||
|
||||
| product | ('value', 'min') | ('value', 'max') | ('value', 'mean') | |
||||
|:-------------|-------------------:|-------------------:|--------------------:| |
||||
| chair | 22.89 | 32.12 | 27.505 | |
||||
| mobile phone | 100 | 111.22 | 105.61 | |
||||
| table | 20.45 | 99.99 | 51.22 | |
||||
|
||||
Note: The columns don't have to be MultiIndex |
||||
|
||||
## Correction |
||||
|
||||
1. The question is validated if the output is: |
||||
|
||||
| product | ('value', 'min') | ('value', 'max') | ('value', 'mean') | |
||||
|:-------------|-------------------:|-------------------:|--------------------:| |
||||
| chair | 22.89 | 32.12 | 27.505 | |
||||
| mobile phone | 100 | 111.22 | 105.61 | |
||||
| table | 20.45 | 99.99 | 51.22 | |
||||
|
||||
Note: The columns don't have to be MultiIndex |
||||
|
||||
My answer is: `df.groupby('product').agg({'value':['min','max','mean']})` |
||||
|
||||
# Exercise 6 Unstack

The goal of this exercise is to learn to unstack a MultiIndex.
Let's assume we trained a machine learning model that predicts a daily score for the companies (tickers) below. It may be very useful to unstack the MultiIndex: to plot the time series, to vectorize the backtest, etc.
||||
|
||||
```
import numpy as np
import pandas as pd

business_dates = pd.bdate_range('2021-01-01', '2021-12-31')

# generate tickers
tickers = ['AAPL', 'FB', 'GE', 'AMZN', 'DAI']

# create index
index = pd.MultiIndex.from_product([business_dates, tickers], names=['Date', 'Ticker'])

# create DataFrame
market_data = pd.DataFrame(index=index,
                           data=np.random.randn(len(index), 1),
                           columns=['Prediction'])
```
||||
1. Unstack the DataFrame. |
||||
|
||||
The first 3 rows of the DataFrame should look like this:
||||
|
||||
| Date | ('Prediction', 'AAPL') | ('Prediction', 'AMZN') | ('Prediction', 'DAI') | ('Prediction', 'FB') | ('Prediction', 'GE') | |
||||
|:--------------------|-------------------------:|-------------------------:|------------------------:|-----------------------:|-----------------------:| |
||||
| 2021-01-01 00:00:00 | 0.382312 | -0.072392 | -0.551167 | -0.0585555 | 1.05955 | |
||||
| 2021-01-04 00:00:00 | -0.560953 | 0.503199 | -0.79517 | -3.23136 | 1.50271 | |
||||
| 2021-01-05 00:00:00 | 0.211489 | 1.84867 | 0.287906 | -1.81119 | 1.20321 | |
||||
|
||||
|
||||
2. Plot the 5 time series on the same plot, using Pandas built-in visualisation functions, with a title.
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output of `unstacked_df.head()` is:
||||
|
||||
| Date | ('Prediction', 'AAPL') | ('Prediction', 'AMZN') | ('Prediction', 'DAI') | ('Prediction', 'FB') | ('Prediction', 'GE') | |
||||
|:--------------------|-------------------------:|-------------------------:|------------------------:|-----------------------:|-----------------------:| |
||||
| 2021-01-01 00:00:00 | 0.382312 | -0.072392 | -0.551167 | -0.0585555 | 1.05955 | |
||||
| 2021-01-04 00:00:00 | -0.560953 | 0.503199 | -0.79517 | -3.23136 | 1.50271 | |
||||
| 2021-01-05 00:00:00 | 0.211489 | 1.84867 | 0.287906 | -1.81119 | 1.20321 | |
||||
|
||||
2. The question is validated if the answer is: `unstacked.plot(title = 'Stocks 2021')`. The title can be anything else. |
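For question 1, a minimal sketch of the unstack step:

```
# move the Ticker level of the index to the columns
unstacked_df = market_data.unstack()
print(unstacked_df.head())
```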
||||
|
||||
|
@ -0,0 +1,306 @@
|
||||
# D05 Piscine AI - Data Science |
||||
|
||||
The goal of this day is to understand practical usage of Pandas.
Today we will discover some important functionalities of Pandas. They will allow you to manipulate the data (DataFrame and Series) in order to clean, delete, add, merge and leverage more information.

In Data Science this is crucial, because without clean data algorithms can't learn.
||||
|
||||
|
||||
|
||||
Author: |
||||
|
||||
# Table of Contents: |
||||
Historical part: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
Not only is the pandas library a central component of the data science toolkit, but it is used in conjunction with other libraries in that collection.

Pandas is built on top of the NumPy package, meaning a lot of the structure of NumPy is used or replicated in Pandas. Data in pandas is often used to feed statistical analysis in SciPy, plotting functions from Matplotlib, and machine learning algorithms in Scikit-learn.
||||
|
||||
## Historical |
||||
|
||||
## Rules |
||||
... |
||||
## Resources

- Pandas website
- https://jakevdp.github.io/PythonDataScienceHandbook/
- https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf
- https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/
||||
|
||||
|
||||
# Exercise 1

The goal of this exercise is to learn to manipulate time series in Pandas.

1. Create a `Series` named `integer_series` from 1st January 2010 to 31 December 2020. Each date is associated with the number of days since 1st January 2010. It starts with 0.

2. Using Pandas, compute a 7-day moving average **without a for loop**. This transformation smooths the time series by removing small fluctuations.
||||
|
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is:
||||
|
||||
``` |
||||
2010-01-01 0 |
||||
2010-01-02 1 |
||||
2010-01-03 2 |
||||
2010-01-04 3 |
||||
2010-01-05 4 |
||||
... |
||||
2020-12-27 4013 |
||||
2020-12-28 4014 |
||||
2020-12-29 4015 |
||||
2020-12-30 4016 |
||||
2020-12-31 4017 |
||||
Freq: D, Name: integer_series, Length: 4018, dtype: int64 |
||||
``` |
||||
The best solution uses `pd.date_range` to generate the index and `range` to generate the integer series. |
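A minimal sketch of that approach:

```
import pandas as pd

dates = pd.date_range('2010-01-01', '2020-12-31')
integer_series = pd.Series(range(len(dates)), index=dates, name='integer_series')
print(integer_series)
```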
||||
|
||||
2. This question is validated if the output is: |
||||
|
||||
``` |
||||
2010-01-01 NaN |
||||
2010-01-02 NaN |
||||
2010-01-03 NaN |
||||
2010-01-04 NaN |
||||
2010-01-05 NaN |
||||
... |
||||
2020-12-27 4010.0 |
||||
2020-12-28 4011.0 |
||||
2020-12-29 4012.0 |
||||
2020-12-30 4013.0 |
||||
2020-12-31 4014.0 |
||||
Freq: D, Name: integer_series, Length: 4018, dtype: float64 |
||||
``` |
||||
If the `NaN` values have been dropped, the solution is also accepted. The solution uses `rolling().mean()`.
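A minimal sketch, assuming `integer_series` is the Series created in question 1:

```
# 7-day rolling window; the first 6 values are NaN because the window is incomplete
moving_average = integer_series.rolling(7).mean()
print(moving_average)
```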
||||
|
||||
# Exercise 2

The goal of this exercise is to learn to use Pandas on Time Series and on Financial data.
||||
|
||||
The data we will use is Apple stock. |
||||
|
||||
1. Using `Plotly`, plot a candlestick chart.

2. Aggregate the data to the **last business day of each month**. The aggregation should consider the meaning of the variables. How many months are in the considered period?

3. When comparing many stocks, the metric which is frequently used is the return of the price. The price is not a convenient metric, as prices evolve in different ranges. The return at time t is defined as

- (Price(t) - Price(t-1)) / Price(t-1)

Using the open price, compute the **daily return**. Propose two different ways, **without a for loop**.
||||
|
||||
|
||||
## Correction: |
||||
Preliminary: |
||||
|
||||
- As usual the first steps are: |
||||
|
||||
- Check missing values and data types |
||||
- Convert string dates to datetime |
||||
- Set dates as index |
||||
- Use `info` or `describe` to have a first look at the data |
||||
|
||||
The exercise is not validated if these steps haven't been done.

1. The candlestick is based on the Open, High, Low and Close columns. The index is Date (datetime). As long as you inserted the right columns in the Plotly `Candlestick` object, you validate the question.
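A minimal sketch, assuming the Apple data is loaded in a DataFrame `df` indexed by `Date` (the variable name is an assumption):

```
import plotly.graph_objects as go

# df is assumed to hold the Apple stock data, indexed by Date
fig = go.Figure(data=[go.Candlestick(x=df.index,
                                     open=df['Open'],
                                     high=df['High'],
                                     low=df['Low'],
                                     close=df['Close'])])
fig.show()
```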
||||
|
||||
2. This question is validated if the output of `print(transformed_df.head().to_markdown())` is |
||||
|
||||
| Date | Open | Close | Volume | High | Low | |
||||
|:--------------------|---------:|---------:|------------:|---------:|---------:| |
||||
| 1980-12-31 00:00:00 | 0.136075 | 0.135903 | 1.34485e+09 | 0.161272 | 0.112723 | |
||||
| 1981-01-30 00:00:00 | 0.141768 | 0.141316 | 6.08989e+08 | 0.155134 | 0.126116 | |
||||
| 1981-02-27 00:00:00 | 0.118215 | 0.117892 | 3.21619e+08 | 0.128906 | 0.106027 | |
||||
| 1981-03-31 00:00:00 | 0.111328 | 0.110871 | 7.00717e+08 | 0.120536 | 0.09654 | |
||||
| 1981-04-30 00:00:00 | 0.121811 | 0.121545 | 5.36928e+08 | 0.131138 | 0.108259 | |
||||
|
||||
To get this result there are two ways: `resample` and `groupby`. There are two key steps (see the sketch below):

- Find how to apply the aggregation on the last **business** day of each month. This is already implemented in Pandas, and the keyword that should be used, either in the `resample` parameter or in `Grouper`, is `BM`.
- Choose the right aggregation function for each variable. The prices (Open, Close and Adjusted Close) should be aggregated by taking the `mean`. Low should be aggregated by taking the `minimum` because it represents the lowest price of the day, and the lowest price of the month is the lowest of the daily lows. The same logic, applied to High, leads to using the `maximum` to aggregate the High. Volume should be aggregated using the `sum` because the monthly volume is equal to the sum of the daily volumes over the month.
||||
|
||||
There are **482 months**. |
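A minimal sketch of the `resample` way, assuming `df` is the daily Apple DataFrame indexed by a datetime `Date`:

```
# 'BM' = business month end; each column gets the aggregation
# that matches its meaning
transformed_df = df.resample('BM').agg({'Open': 'mean',
                                        'Close': 'mean',
                                        'Volume': 'sum',
                                        'High': 'max',
                                        'Low': 'min'})
print(transformed_df.head().to_markdown())
```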
||||
|
||||
3. The solution is accepted if it doesn't involve a for loop and the output is: |
||||
|
||||
``` |
||||
Date |
||||
1980-12-12 NaN |
||||
1980-12-15 -0.047823 |
||||
1980-12-16 -0.073063 |
||||
1980-12-17 0.019703 |
||||
1980-12-18 0.028992 |
||||
... |
||||
2021-01-25 0.049824 |
||||
2021-01-26 0.003704 |
||||
2021-01-27 -0.001184 |
||||
2021-01-28 -0.027261 |
||||
2021-01-29 -0.026448 |
||||
Name: Open, Length: 10118, dtype: float64 |
||||
``` |
||||
- The first way to compute the return without a for loop is to use `pct_change`
- The second way to compute the return without a for loop is to implement the formula given in the exercise in a vectorized way. To get the value at `t-1` you can use `shift`
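A minimal sketch of both ways, assuming `df` is the daily Apple DataFrame:

```
# way 1: built-in percentage change
daily_return = df['Open'].pct_change()

# way 2: the formula of the exercise, vectorized; shift(1) gives Price(t-1)
daily_return = (df['Open'] - df['Open'].shift(1)) / df['Open'].shift(1)
```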
||||
|
||||
# Exercise 3 Multi asset returns

The goal of this exercise is to learn to compute daily returns on a DataFrame that contains many assets (multi-assets).
||||
|
||||
```
import numpy as np
import pandas as pd

business_dates = pd.bdate_range('2021-01-01', '2021-12-31')

# generate tickers
tickers = ['AAPL', 'FB', 'GE', 'AMZN', 'DAI']

# create index
index = pd.MultiIndex.from_product([business_dates, tickers], names=['Date', 'Ticker'])

# create DataFrame
market_data = pd.DataFrame(index=index,
                           data=np.random.randn(len(index), 1),
                           columns=['Price'])
```
||||
1. **Without using a for loop**, compute the daily returns (return(d) = (price(d)-price(d-1))/price(d-1)) for all the companies and return a DataFrame as:
||||
|
||||
| Date | ('Price', 'AAPL') | ('Price', 'AMZN') | ('Price', 'DAI') | ('Price', 'FB') | ('Price', 'GE') | |
||||
|:--------------------|--------------------:|--------------------:|-------------------:|------------------:|------------------:| |
||||
| 2021-01-01 00:00:00 | nan | nan | nan | nan | nan | |
||||
| 2021-01-04 00:00:00 | 1.01793 | 0.0512955 | 3.84709 | -0.503488 | 0.33529 | |
||||
| 2021-01-05 00:00:00 | -0.222884 | -1.64623 | -0.71817 | -5.5036 | -4.15882 | |
||||
|
||||
Note: The data is generated randomly, so you may have different values. But this shows the expected DataFrame structure.

Hint: use `groupby`.
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if, without having used a for loop, the outputted DataFrame's shape is `(261, 5)` and your output is the same as the one returned by this line of code:
||||
|
||||
``` |
||||
market_data.loc[market_data.index.get_level_values('Ticker')=='AAPL'].sort_index().pct_change() |
||||
|
||||
``` |
||||
The DataFrame contains random data. Make sure your output and the one returned by this code are based on the same DataFrame.
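One possible solution, as a minimal sketch: unstacking first turns the per-ticker computation into a plain column-wise `pct_change` (a `groupby` on the Ticker level of the long format works too, as the hint suggests):

```
returns = market_data.sort_index().unstack().pct_change()
print(returns.head())
```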
||||
|
||||
|
||||
# Exercise 4 Backtest

The goal of this exercise is to learn to perform a backtest in Pandas. A backtest is a tool that allows you to know how a strategy would have performed retrospectively using historical data. In this exercise we will focus on the backtesting tool and not on how to build the best strategy.
||||
|
||||
We will backtest a **long only** strategy on Apple Inc. Long only means that we only consider buying the stock. The input signal at date d says if the close price will increase at d+1. We assume that the input signal is available before the market closes. |
||||
|
||||
|
||||
1. Drop the rows with missing values and compute the daily future return of the Apple stock on the adjusted close price. The daily future return means: **Return(t) = (Price(t+1) - Price(t))/Price(t)**.
There are some events, such as splits or dividends, that artificially change the price of the stock. That is why the close price is adjusted: to avoid having outliers in the price data.
||||
|
||||
2. Create a Series that contains a random boolean array with **p=0.5**.

```
Here is an example of the expected time series
||||
2010-01-01 1 |
||||
2010-01-02 0 |
||||
2010-01-03 0 |
||||
2010-01-04 1 |
||||
2010-01-05 0 |
||||
Freq: D, Name: long_only_signal, dtype: int64 |
||||
``` |
||||
- The information in this series should be interpreted this way:
  - On 2010-01-01 I receive `1` before the market closes, meaning that, if I trust the signal, the close price of day d+1 will increase. I should buy the stock before the market closes.
  - On 2010-01-02 I receive `0` before the market closes, meaning that, if I trust the signal, the close price of day d+1 will not increase. I should not buy the stock.
||||
|
||||
|
||||
3. Backtest the signal created in Question 2. Here are some assumptions made to backtest this signal: |
||||
- When, at date d, the signal equals 1 we buy 1$ of stock just before the market closes and we sell the stock just before the market closes the next day. |
||||
- When, at date d, the signal equals 0, we do not buy anything. |
||||
- The profit is not reinvested; when invested, the amount is always 1$.
||||
- Fees are not considered |
||||
|
||||
**The expected output** is a **Series that gives for each day the return of the strategy. The return of the strategy is the PnL (Profit and Losses) divided by the invested amount**. The PnL for day d is:
(money earned this day - money invested this day)
Let's take the example of a 20% return for an invested amount of 1$. The PnL is (1.2 - 1) = 0.2. We notice that the PnL when the signal is 1 equals the daily return. The PnL when the signal is 0 is 0.
By convention, we consider that the PnL of d is affected to day d and not d+1, even if the underlying return contains the information of d+1.
||||
|
||||
**The usage of a for loop is not allowed**.
||||
|
||||
4. Compute the return of the strategy. The return of the strategy is defined as: (Total earned - Total invested) / Total invested |
||||
|
||||
5. Now the input signal is: **always buy**. Compute the daily PnL and the total PnL. Plot the daily PnL of Q5 and of Q3 on the same plot |
||||
|
||||
https://www.investopedia.com/terms/b/backtesting.asp |
||||
|
||||
|
||||
## Correction |
||||
Preliminary: |
||||
|
||||
- As usual the first steps are: |
||||
|
||||
- Check missing values and data types |
||||
- Convert string dates to datetime |
||||
- Set dates as index |
||||
- Use `info` or `describe` to have a first look at the data |
||||
|
||||
The exercise is not validated if these steps haven't been done.

My results can be reproduced using `np.random.seed(2712)`. Given the versions of NumPy used, I do not guarantee the reproducibility of the results - that is why I also explain the steps to get to the solution.
||||
|
||||
|
||||
1. This question is validated if the return is computed as: Return(t) = (Price(t+1) - Price(t))/Price(t) and returns this output. |
||||
|
||||
``` |
||||
Date |
||||
1980-12-12 -0.052170 |
||||
1980-12-15 -0.073403 |
||||
1980-12-16 0.024750 |
||||
1980-12-17 0.029000 |
||||
1980-12-18 0.061024 |
||||
... |
||||
2021-01-25 0.001679 |
||||
2021-01-26 -0.007684 |
||||
2021-01-27 -0.034985 |
||||
2021-01-28 -0.037421 |
||||
2021-01-29 NaN |
||||
Name: Daily_futur_returns, Length: 10118, dtype: float64 |
||||
|
||||
``` |
||||
The answer is also accepted if the return is computed as in exercise 2 and then shifted into the future using `shift`, but I do not recommend this implementation as it adds missing values!
An example of a solution is:
||||
|
||||
|
||||
``` |
||||
def compute_futur_return(price): |
||||
return (price.shift(-1) - price)/price |
||||
|
||||
compute_futur_return(df['Adj Close']) |
||||
``` |
||||
Note that if the index is not sorted in ascending order, the computed future return is wrong.
||||
|
||||
2. This question is validated if the index of the Series is the same as the index of the DataFrame. The data of the series can be generated using `np.random.randint(0, 2, len(df.index))`.
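A minimal sketch, assuming `df` is the Apple DataFrame indexed by `Date`:

```
import numpy as np
import pandas as pd

# one 0/1 draw per row, aligned on the price index
signal = pd.Series(np.random.randint(0, 2, len(df.index)),
                   index=df.index,
                   name='long_only_signal')
```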
||||
|
||||
3. This question is validated if the PnL is computed as: signal * futur_return. Both Series should have the same index.
||||
|
||||
``` |
||||
Date |
||||
1980-12-12 -0.052170 |
||||
1980-12-15 -0.073403 |
||||
1980-12-16 0.024750 |
||||
1980-12-17 0.029000 |
||||
1980-12-18 0.061024 |
||||
... |
||||
2021-01-25 0.001679 |
||||
2021-01-26 -0.007684 |
||||
2021-01-27 -0.034985 |
||||
2021-01-28 -0.037421 |
||||
2021-01-29 NaN |
||||
Name: PnL, Length: 10119, dtype: float64 |
||||
``` |
||||
|
||||
4. The question is validated if you computed the return of the strategy as: (Total earned - Total invested) / Total invested. The result should be close to 0. The formula given can be simplified as `PnLs.sum() / signal.sum()`.
My return is 0.00043546984088551553, because I invested 5147$ and earned 5149$.
||||
|
||||
|
||||
5. The question is validated if you replaced the previous signal Series with 1s. Similarly to the previous question, we earned 10128$ and we invested 10118$, which leads to a return of 0.00112670194140969 (0.1%).
||||
|
@ -0,0 +1,164 @@
|
||||
# D0607 Piscine AI - Data Science |
||||
|
||||
## SP data preprocessing |
||||
|
||||
The goal of this project is to perform a backtest on the SP500 constituents. The SP500 is an index of the 500 biggest capitalizations in the US.
||||
|
||||
|
||||
## Data |
||||
|
||||
The input files are:

- `sp500.csv` contains the SP500 data. The SP500 is a stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the United States.
- `stock_prices.csv` contains the close prices for all the companies that have been in the SP500. It contains a lot of missing data. The adjusted close price may be unavailable for three main reasons:

  - The company doesn't exist at date d
  - The company is not public (not listed)
  - Its close price hasn't been reported
  - Note: The quality of this data set is not good: some prices are wrong, there are some price spikes, there are some price adjustments (share splits, dividend distributions) - the price adjustments are corrected in the adjusted close. But I'm not providing the corrected data for this project, to let you understand what bad quality data is and how important it is to detect outliers and missing values. The idea is not to correct the full data set manually but to correct the main problems.

*Note: The corrections won't fix the data; as a result, the results may be abnormal compared to results from cleaned financial data. That's not a problem for this small project!*
||||
|
||||
## Problem |
||||
|
||||
Once preprocessed, this data will be used to generate a signal, that is, for each asset at each date, a metric that indicates if the asset price will increase the next month. At each date (once a month) we will take the 20 highest metrics and invest 1$ per company. This strategy is called stock picking. It consists in picking stocks in an index and trying to outperform the index. Finally, we will compare the performance of our strategy to the benchmark: the SP500.
||||
|
||||
|
||||
|
||||
It is important to understand that the SP500 components change over time. The reason is simple: Facebook entered the SP500 in ???? and, as there are only 500 companies, another company had to leave the index.
||||
|
||||
|
||||
|
||||
The structure of the project is: |
||||
|
||||
``` |
||||
project |
||||
│ README.md |
||||
│ environment.yml |
||||
│ |
||||
└───data |
||||
│ │ sp500.csv |
||||
│ | prices.csv |
||||
│ |
||||
└───notebook |
||||
│ │ analysis.ipynb |
||||
| |
||||
|───scripts |
||||
| │ memory_reducer.py |
||||
| │ preprocessing.py |
||||
| │ create_signal.py |
||||
| | backtester.py |
||||
│ | main.py |
||||
│ |
||||
└───results |
||||
│ plots |
||||
│ results.txt |
||||
│ outliers.txt |
||||
|
||||
``` |
||||
|
||||
There are five parts:
||||
|
||||
## 1. Preliminary |
||||
|
||||
- Create a function that takes as input one CSV data file, optimizes the types to reduce its size, and returns a memory-optimized DataFrame.
- For float data the smallest data type used is `np.float32`
- These steps may help you to implement the memory_reducer (see the sketch after this list):
||||
|
||||
1. Iterate over every column |
||||
2. Determine if the column is numeric |
||||
3. Determine if the column can be represented by an integer |
||||
4. Find the min and the max value |
||||
5. Determine and apply the smallest datatype that can fit the range of values |
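A minimal sketch of those steps; the function name `memory_reducer` matches the script name above, but the column handling and the integer ladder are one possible implementation, not the required one:

```
import numpy as np
import pandas as pd

def memory_reducer(path):
    df = pd.read_csv(path)
    for col in df.columns:
        if pd.api.types.is_float_dtype(df[col]):
            # np.float32 is the smallest float type allowed here
            df[col] = df[col].astype(np.float32)
        elif pd.api.types.is_integer_dtype(df[col]):
            # find the smallest integer type that fits the range of values
            c_min, c_max = df[col].min(), df[col].max()
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                if np.iinfo(int_type).min <= c_min <= c_max <= np.iinfo(int_type).max:
                    df[col] = df[col].astype(int_type)
                    break
    return df
```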
||||
|
||||
|
||||
|
||||
## 2. Data wrangling and preprocessing: |
||||
|
||||
- Create a Jupyter Notebook to analyse the data sets and perform EDA (Exploratory Data Analysis). This notebook should contain at least:
  - Missing values analysis
  - Outliers analysis (there are a lot of outliers)
  - One histogram of the average price for companies, for all variables (save the plot with the images).
  - Describe at least 5 outliers ('ticker', 'date', 'price'). Put them in an `outliers.txt` file with the 3 fields in the folder `results`.

*Note: create functions that generate the plots and save them in the images folder. Add a parameter `plot` with a default value `False` which doesn't return the plot. This will be useful for the correction, to let people run your code without overriding your plots.*
||||
|
||||
|
||||
- Here is how the `prices` data should be preprocessed:
  - Resample data on month and keep the last value
  - Filter price outliers: remove prices outside of the range 0.1$ - 10k$
  - Compute monthly returns:
    - Historical returns. **returns(current month) = (price(current month) - price(previous month)) / price(previous month)**
    - Future returns. **returns(current month) = (price(next month) - price(current month)) / price(current month)**
  - Replace return outliers by the last value available for the company. This corrects price spikes that correspond to a monthly return greater than 1 or smaller than -0.5. This correction shouldn't consider the 2008 and 2009 period, as the financial crisis impacted the market brutally. **Don't forget that a value is considered an outlier by comparison with the other returns/prices of the same company**
||||
|
||||
At this stage the DataFrame should look like this:
||||
|
||||
| | Price | monthly_past_return | monthly_futur_return | |
||||
|:-----------------------------------------------------|---------:|----------------------:|-----------------------:| |
||||
| (Timestamp('2000-12-31 00:00:00', freq='M'), 'A') | 36.7304 | nan | -0.00365297 | |
||||
| (Timestamp('2000-12-31 00:00:00', freq='M'), 'AA') | 25.9505 | nan | 0.101194 | |
||||
| (Timestamp('2000-12-31 00:00:00', freq='M'), 'AAPL') | 1.00646 | nan | 0.452957 | |
||||
| (Timestamp('2000-12-31 00:00:00', freq='M'), 'ABC') | 11.4383 | nan | -0.0528713 | |
||||
| (Timestamp('2000-12-31 00:00:00', freq='M'), 'ABT') | 38.7945 | nan | -0.07205 | |
||||
||||
|
||||
- Fill the missing values using the last available value (same company) |
||||
- Drop the missing values that can't be filled |
||||
- Print `prices.isna().sum()` |
||||
|
||||
|
||||
- Here is how the `sp500.csv` data should be preprocessed: |
||||
- Resample data on month and keep the last value |
||||
- Compute historical monthly returns on the adjusted close |
||||
|
||||
|
||||
|
||||
## 3. Create signal |
||||
|
||||
At this stage we have a data set with features that we will leverage to get an investment signal. As previously said, we will focus on one single variable to create the signal: **monthly_past_return**. The signal will be the average of the monthly returns of the previous year.

The naive assumption made here is that if a stock has performed well over the last year, it will perform well the next month. Moreover, we assume that we can buy stocks as soon as we have the signal (the signal is available at the close of day d and we assume that we can buy the stock at the close of day d. The assumption is acceptable when considering monthly returns, because the difference between the close of day d and the open of day d+1 is small compared to the monthly return).
||||
|
||||
- Create a column `average_return_1y`
- Create a column named `signal` that contains True if `average_return_1y` is among the 20 highest values of `average_return_1y` within the month. A sketch is given after this list.
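A minimal sketch of both columns, assuming `prices` has a MultiIndex with levels named `Date` (month end) and `Ticker`, sorted by date within each ticker (these names are assumptions):

```
import pandas as pd

# average of the monthly returns of the previous year, per company
prices['average_return_1y'] = (
    prices.groupby(level='Ticker')['monthly_past_return']
          .transform(lambda s: s.rolling(12).mean())  # 1 year = 12 monthly rows
)

# rank the metric within each month; rank 1 = highest metric
prices['signal'] = (
    prices.groupby(level='Date')['average_return_1y']
          .rank(ascending=False) <= 20
)
```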
||||
|
||||
|
||||
## 4. Backtester |
||||
|
||||
At this stage we have an investment signal that indicates, each month, the 20 companies we should invest 1$ in (1$ each). In order to check the strategy's performance we will backtest our investment signal.

- Compute the PnL and the total return of our strategy without a for loop. Save the results in a text file `results.txt` in the folder `results`.
- Compute the PnL and the total return of the strategy that consists in investing 20$ each month in the SP500. Compare. Save the results in the same text file `results.txt` in the folder `results`.
- Create a plot that shows the performance of the strategy over time for the SP500 and the Stock Picking 20 strategy.
A data point (x-axis: date, y-axis: cumulated_return) is the **cumulative return** from the beginning of the strategy up to date t. Save the plot in the results folder.

This plot is used a lot in Finance because it helps to compare a custom strategy with an index. In that case we say that the SP500 is used as a **benchmark** for the Stock Picking strategy.
||||
|
||||
|
||||
![alt text][performance] |
||||
|
||||
[performance]: images/weekend/w1_weekend_plot_pnl.png "Cumulative Performance" |
||||
|
||||
## 5. Main |
||||
|
||||
Here is a sketch of `main.py`. |
||||
|
||||
``` |
||||
main.py |
||||
|
||||
|
||||
# import data |
||||
prices, sp500 = memory_reducer(paths) |
||||
|
||||
# preprocessing |
||||
prices, sp500 = preprocessing(prices, sp500) |
||||
|
||||
# create signal |
||||
prices = create_signal(prices) |
||||
|
||||
#backtest |
||||
|
||||
backtest(prices, sp500) |
||||
``` |
||||
|
||||
**The command `python main.py` executes the code from the data imports to the backtest and saves the results.**
@ -0,0 +1,97 @@
|
||||
|
||||
``` |
||||
project |
||||
│ README.md |
||||
│ environment.yml |
||||
│ |
||||
└───data |
||||
│ │ sp500.csv |
||||
│ | prices.csv |
||||
│ |
||||
└───notebook |
||||
│ │ analysis.ipynb |
||||
| |
||||
|───scripts |
||||
| │ memory_reducer.py |
||||
| │ preprocessing.py |
||||
| │ create_signal.py |
||||
| | backtester.py |
||||
│ | main.py |
||||
│ |
||||
└───results |
||||
│ plots |
||||
│ results.txt |
||||
│ outliers.txt |
||||
|
||||
``` |
||||
- The readme file contains a description of the project and explains how to run the code from an empty environment. It also gives a summary of the implementation of each Python file. The preprocessing, which is a key part, should be described precisely. Finally, it should contain a conclusion that gives the performance of the strategy.
||||
|
||||
- The environment has to contain all libraries used and their versions that are necessary to run the code. |
||||
|
||||
- The notebook has to contain:
  - Missing values analysis. **Example**: number of missing values per variable or per year
  - Outliers analysis
  - Histogram of average price for companies for all variables (save the plot with the images). This is required only for the `prices.csv` data.
  - Describe at least 5 outliers ('ticker', 'date', 'price'). Checking the outliers is simple: search the historical stock price on Google at the given date and compare. The price may fluctuate a bit. The goal here is not to match the historical price found on Google but to detect a huge difference between the price in our data and the real historical one.
||||
|
||||
|
||||
Notes: |
||||
- For all questions always check the values are sorted by date. If not the answers are wrong. |
||||
- The plots are validated only if they contain a title |
||||
|
||||
## Python files |
||||
### 1. memory_reducer.py |
||||
|
||||
The memory_reducer is validated if: |
||||
|
||||
- The `prices` data set weighs less than **8MB** (Mega Bytes)
- The `sp500` data set weighs less than **0.15MB** (Mega Bytes)
- For float data the smallest data type used is np.float32. Smaller data types may alter the precision of the data.
||||
|
||||
|
||||
### 2. preprocessing.py |
||||
|
||||
The preprocessing is validated if: |
||||
|
||||
|
||||
#### Prices |
||||
|
||||
- The data is aggregated on a monthly period and only the last element is kept
- The outliers are filtered out by removing all prices bigger than 10k$ and smaller than 0.1$
- The historical return is computed using only current and past values.
- The future return is computed using only current and future values. (Reminder: as the data is resampled monthly, computing the return is straightforward)
- The outliers in the returns data are set to NaN for all returns not in the years 2008 and 2009. The filters are: return > 1 or return < -0.5.
- The missing values are filled using the last value available **for the company**, as sketched after this list. `df.fillna(method='ffill')` is wrong because the previous value can be the return or price of another company.
- The missing values that can't be filled using the previous existing value are dropped.
- The number of missing values is 0
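A minimal sketch of the per-company fill, assuming `prices` has a MultiIndex with a level named `Ticker` (an assumption):

```
# forward-fill within each company only, so a value never leaks
# from one ticker to the next
prices = prices.groupby(level='Ticker').ffill()
prices = prices.dropna()
print(prices.isna().sum())  # all zeros
```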
||||
|
||||
Best practice: |
||||
|
||||
Do not fill the last values for the future return, because those values are missing only because the data set ends at a given date. Filling them with the previous value doesn't make sense. It makes more sense to drop the rows, because the backtest focuses on observed data.
||||
|
||||
|
||||
### 3. create_signal.py |
||||
|
||||
The signal creation is validated if: |
||||
|
||||
- The metric `average_return_1y` is added as a new column of the merged DataFrame. The metric is relative to a company. It is important to group the data by company first, before computing the average return over 1y. It is accepted to consider that one year is 12 consecutive rows.
- The signal is added as a new column to the merged DataFrame. The signal, which is boolean, indicates whether, within the same month, the company is in the top 20. The top 20 corresponds to the 20 companies with the 20 highest metrics within the same month. The highest metric gets the rank 1 (if rank is used, the parameter `ascending` should be set to `False`).
||||
|
||||
### 4. backtester.py |
||||
|
||||
The backtester is validated if: |
||||
|
||||
- The PnL is computed by multiplying the signal `Series` by the **future returns**.
- The return of the strategy is computed by dividing the PnL by the sum of the signal `Series`.
- The signal used on the SP500 is the pd.Series([20,20,...,20])
- The series used in the plot are the cumulative PnL. `cumsum` can be used.
- The PnL on the full historical data is **smaller than 75$**. If not, it means that the outliers were not corrected correctly.
||||
|
||||
![alt text][performance] |
||||
|
||||
[performance]: images/weekend/w1_weekend_plot_pnl.png "Cumulative Performance" |
||||
|
||||
|
||||
### 5. main.py |
||||
|
||||
**The command `python main.py` executes the code from the data imports to the backtest and saves the results.** It shouldn't return any error to validate the project.
@ -0,0 +1,699 @@
|
||||
1000025,5,1,1,1,2,1,3,1,1,2 |
||||
1002945,5,4,4,5,7,10,3,2,1,2 |
||||
1015425,3,1,1,1,2,2,3,1,1,2 |
||||
1016277,6,8,8,1,3,4,3,7,1,2 |
||||
1017023,4,1,1,3,2,1,3,1,1,2 |
||||
1017122,8,10,10,8,7,10,9,7,1,4 |
||||
1018099,1,1,1,1,2,10,3,1,1,2 |
||||
1018561,2,1,2,1,2,1,3,1,1,2 |
||||
1033078,2,1,1,1,2,1,1,1,5,2 |
||||
1033078,4,2,1,1,2,1,2,1,1,2 |
||||
1035283,1,1,1,1,1,1,3,1,1,2 |
||||
1036172,2,1,1,1,2,1,2,1,1,2 |
||||
1041801,5,3,3,3,2,3,4,4,1,4 |
||||
1043999,1,1,1,1,2,3,3,1,1,2 |
||||
1044572,8,7,5,10,7,9,5,5,4,4 |
||||
1047630,7,4,6,4,6,1,4,3,1,4 |
||||
1048672,4,1,1,1,2,1,2,1,1,2 |
||||
1049815,4,1,1,1,2,1,3,1,1,2 |
||||
1050670,10,7,7,6,4,10,4,1,2,4 |
||||
1050718,6,1,1,1,2,1,3,1,1,2 |
||||
1054590,7,3,2,10,5,10,5,4,4,4 |
||||
1054593,10,5,5,3,6,7,7,10,1,4 |
||||
1056784,3,1,1,1,2,1,2,1,1,2 |
||||
1057013,8,4,5,1,2,?,7,3,1,4 |
||||
1059552,1,1,1,1,2,1,3,1,1,2 |
||||
1065726,5,2,3,4,2,7,3,6,1,4 |
||||
1066373,3,2,1,1,1,1,2,1,1,2 |
||||
1066979,5,1,1,1,2,1,2,1,1,2 |
||||
1067444,2,1,1,1,2,1,2,1,1,2 |
||||
1070935,1,1,3,1,2,1,1,1,1,2 |
||||
1070935,3,1,1,1,1,1,2,1,1,2 |
||||
1071760,2,1,1,1,2,1,3,1,1,2 |
||||
1072179,10,7,7,3,8,5,7,4,3,4 |
||||
1074610,2,1,1,2,2,1,3,1,1,2 |
||||
1075123,3,1,2,1,2,1,2,1,1,2 |
||||
1079304,2,1,1,1,2,1,2,1,1,2 |
||||
1080185,10,10,10,8,6,1,8,9,1,4 |
||||
1081791,6,2,1,1,1,1,7,1,1,2 |
||||
1084584,5,4,4,9,2,10,5,6,1,4 |
||||
1091262,2,5,3,3,6,7,7,5,1,4 |
||||
1096800,6,6,6,9,6,?,7,8,1,2 |
||||
1099510,10,4,3,1,3,3,6,5,2,4 |
||||
1100524,6,10,10,2,8,10,7,3,3,4 |
||||
1102573,5,6,5,6,10,1,3,1,1,4 |
||||
1103608,10,10,10,4,8,1,8,10,1,4 |
||||
1103722,1,1,1,1,2,1,2,1,2,2 |
||||
1105257,3,7,7,4,4,9,4,8,1,4 |
||||
1105524,1,1,1,1,2,1,2,1,1,2 |
||||
1106095,4,1,1,3,2,1,3,1,1,2 |
||||
1106829,7,8,7,2,4,8,3,8,2,4 |
||||
1108370,9,5,8,1,2,3,2,1,5,4 |
||||
1108449,5,3,3,4,2,4,3,4,1,4 |
||||
1110102,10,3,6,2,3,5,4,10,2,4 |
||||
1110503,5,5,5,8,10,8,7,3,7,4 |
||||
1110524,10,5,5,6,8,8,7,1,1,4 |
||||
1111249,10,6,6,3,4,5,3,6,1,4 |
||||
1112209,8,10,10,1,3,6,3,9,1,4 |
||||
1113038,8,2,4,1,5,1,5,4,4,4 |
||||
1113483,5,2,3,1,6,10,5,1,1,4 |
||||
1113906,9,5,5,2,2,2,5,1,1,4 |
||||
1115282,5,3,5,5,3,3,4,10,1,4 |
||||
1115293,1,1,1,1,2,2,2,1,1,2 |
||||
1116116,9,10,10,1,10,8,3,3,1,4 |
||||
1116132,6,3,4,1,5,2,3,9,1,4 |
||||
1116192,1,1,1,1,2,1,2,1,1,2 |
||||
1116998,10,4,2,1,3,2,4,3,10,4 |
||||
1117152,4,1,1,1,2,1,3,1,1,2 |
||||
1118039,5,3,4,1,8,10,4,9,1,4 |
||||
1120559,8,3,8,3,4,9,8,9,8,4 |
||||
1121732,1,1,1,1,2,1,3,2,1,2 |
||||
1121919,5,1,3,1,2,1,2,1,1,2 |
||||
1123061,6,10,2,8,10,2,7,8,10,4 |
||||
1124651,1,3,3,2,2,1,7,2,1,2 |
||||
1125035,9,4,5,10,6,10,4,8,1,4 |
||||
1126417,10,6,4,1,3,4,3,2,3,4 |
||||
1131294,1,1,2,1,2,2,4,2,1,2 |
||||
1132347,1,1,4,1,2,1,2,1,1,2 |
||||
1133041,5,3,1,2,2,1,2,1,1,2 |
||||
1133136,3,1,1,1,2,3,3,1,1,2 |
||||
1136142,2,1,1,1,3,1,2,1,1,2 |
||||
1137156,2,2,2,1,1,1,7,1,1,2 |
||||
1143978,4,1,1,2,2,1,2,1,1,2 |
||||
1143978,5,2,1,1,2,1,3,1,1,2 |
||||
1147044,3,1,1,1,2,2,7,1,1,2 |
||||
1147699,3,5,7,8,8,9,7,10,7,4 |
||||
1147748,5,10,6,1,10,4,4,10,10,4 |
||||
1148278,3,3,6,4,5,8,4,4,1,4 |
||||
1148873,3,6,6,6,5,10,6,8,3,4 |
||||
1152331,4,1,1,1,2,1,3,1,1,2 |
||||
1155546,2,1,1,2,3,1,2,1,1,2 |
||||
1156272,1,1,1,1,2,1,3,1,1,2 |
||||
1156948,3,1,1,2,2,1,1,1,1,2 |
||||
1157734,4,1,1,1,2,1,3,1,1,2 |
||||
1158247,1,1,1,1,2,1,2,1,1,2 |
||||
1160476,2,1,1,1,2,1,3,1,1,2 |
||||
1164066,1,1,1,1,2,1,3,1,1,2 |
||||
1165297,2,1,1,2,2,1,1,1,1,2 |
||||
1165790,5,1,1,1,2,1,3,1,1,2 |
||||
1165926,9,6,9,2,10,6,2,9,10,4 |
||||
1166630,7,5,6,10,5,10,7,9,4,4 |
||||
1166654,10,3,5,1,10,5,3,10,2,4 |
||||
1167439,2,3,4,4,2,5,2,5,1,4 |
||||
1167471,4,1,2,1,2,1,3,1,1,2 |
||||
1168359,8,2,3,1,6,3,7,1,1,4 |
||||
1168736,10,10,10,10,10,1,8,8,8,4 |
||||
1169049,7,3,4,4,3,3,3,2,7,4 |
||||
1170419,10,10,10,8,2,10,4,1,1,4 |
||||
1170420,1,6,8,10,8,10,5,7,1,4 |
||||
1171710,1,1,1,1,2,1,2,3,1,2 |
||||
1171710,6,5,4,4,3,9,7,8,3,4 |
||||
1171795,1,3,1,2,2,2,5,3,2,2 |
||||
1171845,8,6,4,3,5,9,3,1,1,4 |
||||
1172152,10,3,3,10,2,10,7,3,3,4 |
||||
1173216,10,10,10,3,10,8,8,1,1,4 |
||||
1173235,3,3,2,1,2,3,3,1,1,2 |
||||
1173347,1,1,1,1,2,5,1,1,1,2 |
||||
1173347,8,3,3,1,2,2,3,2,1,2 |
||||
1173509,4,5,5,10,4,10,7,5,8,4 |
||||
1173514,1,1,1,1,4,3,1,1,1,2 |
||||
1173681,3,2,1,1,2,2,3,1,1,2 |
||||
1174057,1,1,2,2,2,1,3,1,1,2 |
||||
1174057,4,2,1,1,2,2,3,1,1,2 |
||||
1174131,10,10,10,2,10,10,5,3,3,4 |
||||
1174428,5,3,5,1,8,10,5,3,1,4 |
||||
1175937,5,4,6,7,9,7,8,10,1,4 |
||||
1176406,1,1,1,1,2,1,2,1,1,2 |
||||
1176881,7,5,3,7,4,10,7,5,5,4 |
||||
1177027,3,1,1,1,2,1,3,1,1,2 |
||||
1177399,8,3,5,4,5,10,1,6,2,4 |
||||
1177512,1,1,1,1,10,1,1,1,1,2 |
||||
1178580,5,1,3,1,2,1,2,1,1,2 |
||||
1179818,2,1,1,1,2,1,3,1,1,2 |
||||
1180194,5,10,8,10,8,10,3,6,3,4 |
||||
1180523,3,1,1,1,2,1,2,2,1,2 |
||||
1180831,3,1,1,1,3,1,2,1,1,2 |
||||
1181356,5,1,1,1,2,2,3,3,1,2 |
||||
1182404,4,1,1,1,2,1,2,1,1,2 |
||||
1182410,3,1,1,1,2,1,1,1,1,2 |
||||
1183240,4,1,2,1,2,1,2,1,1,2 |
||||
1183246,1,1,1,1,1,?,2,1,1,2 |
||||
1183516,3,1,1,1,2,1,1,1,1,2 |
||||
1183911,2,1,1,1,2,1,1,1,1,2 |
||||
1183983,9,5,5,4,4,5,4,3,3,4 |
||||
1184184,1,1,1,1,2,5,1,1,1,2 |
||||
1184241,2,1,1,1,2,1,2,1,1,2 |
||||
1184840,1,1,3,1,2,?,2,1,1,2 |
||||
1185609,3,4,5,2,6,8,4,1,1,4 |
||||
1185610,1,1,1,1,3,2,2,1,1,2 |
||||
1187457,3,1,1,3,8,1,5,8,1,2 |
||||
1187805,8,8,7,4,10,10,7,8,7,4 |
||||
1188472,1,1,1,1,1,1,3,1,1,2 |
||||
1189266,7,2,4,1,6,10,5,4,3,4 |
||||
1189286,10,10,8,6,4,5,8,10,1,4 |
||||
1190394,4,1,1,1,2,3,1,1,1,2 |
||||
1190485,1,1,1,1,2,1,1,1,1,2 |
||||
1192325,5,5,5,6,3,10,3,1,1,4 |
||||
1193091,1,2,2,1,2,1,2,1,1,2 |
||||
1193210,2,1,1,1,2,1,3,1,1,2 |
||||
1193683,1,1,2,1,3,?,1,1,1,2 |
||||
1196295,9,9,10,3,6,10,7,10,6,4 |
||||
1196915,10,7,7,4,5,10,5,7,2,4 |
||||
1197080,4,1,1,1,2,1,3,2,1,2 |
||||
1197270,3,1,1,1,2,1,3,1,1,2 |
||||
1197440,1,1,1,2,1,3,1,1,7,2 |
||||
1197510,5,1,1,1,2,?,3,1,1,2 |
||||
1197979,4,1,1,1,2,2,3,2,1,2 |
||||
1197993,5,6,7,8,8,10,3,10,3,4 |
||||
1198128,10,8,10,10,6,1,3,1,10,4 |
||||
1198641,3,1,1,1,2,1,3,1,1,2 |
||||
1199219,1,1,1,2,1,1,1,1,1,2 |
||||
1199731,3,1,1,1,2,1,1,1,1,2 |
||||
1199983,1,1,1,1,2,1,3,1,1,2 |
||||
1200772,1,1,1,1,2,1,2,1,1,2 |
||||
1200847,6,10,10,10,8,10,10,10,7,4 |
||||
1200892,8,6,5,4,3,10,6,1,1,4 |
||||
1200952,5,8,7,7,10,10,5,7,1,4 |
||||
1201834,2,1,1,1,2,1,3,1,1,2 |
||||
1201936,5,10,10,3,8,1,5,10,3,4 |
||||
1202125,4,1,1,1,2,1,3,1,1,2 |
||||
1202812,5,3,3,3,6,10,3,1,1,4 |
||||
1203096,1,1,1,1,1,1,3,1,1,2 |
||||
1204242,1,1,1,1,2,1,1,1,1,2 |
||||
1204898,6,1,1,1,2,1,3,1,1,2 |
||||
1205138,5,8,8,8,5,10,7,8,1,4 |
||||
1205579,8,7,6,4,4,10,5,1,1,4 |
||||
1206089,2,1,1,1,1,1,3,1,1,2 |
||||
1206695,1,5,8,6,5,8,7,10,1,4 |
||||
1206841,10,5,6,10,6,10,7,7,10,4 |
||||
1207986,5,8,4,10,5,8,9,10,1,4 |
||||
1208301,1,2,3,1,2,1,3,1,1,2 |
||||
1210963,10,10,10,8,6,8,7,10,1,4 |
||||
1211202,7,5,10,10,10,10,4,10,3,4 |
||||
1212232,5,1,1,1,2,1,2,1,1,2 |
||||
1212251,1,1,1,1,2,1,3,1,1,2 |
||||
1212422,3,1,1,1,2,1,3,1,1,2 |
||||
1212422,4,1,1,1,2,1,3,1,1,2 |
||||
1213375,8,4,4,5,4,7,7,8,2,2 |
||||
1213383,5,1,1,4,2,1,3,1,1,2 |
||||
1214092,1,1,1,1,2,1,1,1,1,2 |
||||
1214556,3,1,1,1,2,1,2,1,1,2 |
||||
1214966,9,7,7,5,5,10,7,8,3,4 |
||||
1216694,10,8,8,4,10,10,8,1,1,4 |
||||
1216947,1,1,1,1,2,1,3,1,1,2 |
||||
1217051,5,1,1,1,2,1,3,1,1,2 |
||||
1217264,1,1,1,1,2,1,3,1,1,2 |
||||
1218105,5,10,10,9,6,10,7,10,5,4 |
||||
1218741,10,10,9,3,7,5,3,5,1,4 |
||||
1218860,1,1,1,1,1,1,3,1,1,2 |
||||
1218860,1,1,1,1,1,1,3,1,1,2 |
||||
1219406,5,1,1,1,1,1,3,1,1,2 |
||||
1219525,8,10,10,10,5,10,8,10,6,4 |
||||
1219859,8,10,8,8,4,8,7,7,1,4 |
||||
1220330,1,1,1,1,2,1,3,1,1,2 |
||||
1221863,10,10,10,10,7,10,7,10,4,4 |
||||
1222047,10,10,10,10,3,10,10,6,1,4 |
||||
1222936,8,7,8,7,5,5,5,10,2,4 |
||||
1223282,1,1,1,1,2,1,2,1,1,2 |
||||
1223426,1,1,1,1,2,1,3,1,1,2 |
||||
1223793,6,10,7,7,6,4,8,10,2,4 |
||||
1223967,6,1,3,1,2,1,3,1,1,2 |
||||
1224329,1,1,1,2,2,1,3,1,1,2 |
||||
1225799,10,6,4,3,10,10,9,10,1,4 |
||||
1226012,4,1,1,3,1,5,2,1,1,4 |
||||
1226612,7,5,6,3,3,8,7,4,1,4 |
||||
1227210,10,5,5,6,3,10,7,9,2,4 |
||||
1227244,1,1,1,1,2,1,2,1,1,2 |
||||
1227481,10,5,7,4,4,10,8,9,1,4 |
||||
1228152,8,9,9,5,3,5,7,7,1,4 |
||||
1228311,1,1,1,1,1,1,3,1,1,2 |
||||
1230175,10,10,10,3,10,10,9,10,1,4 |
||||
1230688,7,4,7,4,3,7,7,6,1,4 |
||||
1231387,6,8,7,5,6,8,8,9,2,4 |
||||
1231706,8,4,6,3,3,1,4,3,1,2 |
||||
1232225,10,4,5,5,5,10,4,1,1,4 |
||||
1236043,3,3,2,1,3,1,3,6,1,2 |
||||
1241232,3,1,4,1,2,?,3,1,1,2 |
||||
1241559,10,8,8,2,8,10,4,8,10,4 |
||||
1241679,9,8,8,5,6,2,4,10,4,4 |
||||
1242364,8,10,10,8,6,9,3,10,10,4 |
||||
1243256,10,4,3,2,3,10,5,3,2,4 |
||||
1270479,5,1,3,3,2,2,2,3,1,2 |
||||
1276091,3,1,1,3,1,1,3,1,1,2 |
||||
1277018,2,1,1,1,2,1,3,1,1,2 |
||||
128059,1,1,1,1,2,5,5,1,1,2 |
||||
1285531,1,1,1,1,2,1,3,1,1,2 |
||||
1287775,5,1,1,2,2,2,3,1,1,2 |
||||
144888,8,10,10,8,5,10,7,8,1,4 |
||||
145447,8,4,4,1,2,9,3,3,1,4 |
||||
167528,4,1,1,1,2,1,3,6,1,2 |
||||
169356,3,1,1,1,2,?,3,1,1,2 |
||||
183913,1,2,2,1,2,1,1,1,1,2 |
||||
191250,10,4,4,10,2,10,5,3,3,4 |
||||
1017023,6,3,3,5,3,10,3,5,3,2 |
||||
1100524,6,10,10,2,8,10,7,3,3,4 |
||||
1116116,9,10,10,1,10,8,3,3,1,4 |
||||
1168736,5,6,6,2,4,10,3,6,1,4 |
||||
1182404,3,1,1,1,2,1,1,1,1,2 |
||||
1182404,3,1,1,1,2,1,2,1,1,2 |
||||
1198641,3,1,1,1,2,1,3,1,1,2 |
||||
242970,5,7,7,1,5,8,3,4,1,2 |
||||
255644,10,5,8,10,3,10,5,1,3,4 |
||||
263538,5,10,10,6,10,10,10,6,5,4 |
||||
274137,8,8,9,4,5,10,7,8,1,4 |
||||
303213,10,4,4,10,6,10,5,5,1,4 |
||||
314428,7,9,4,10,10,3,5,3,3,4 |
||||
1182404,5,1,4,1,2,1,3,2,1,2 |
||||
1198641,10,10,6,3,3,10,4,3,2,4 |
||||
320675,3,3,5,2,3,10,7,1,1,4 |
||||
324427,10,8,8,2,3,4,8,7,8,4 |
||||
385103,1,1,1,1,2,1,3,1,1,2 |
||||
390840,8,4,7,1,3,10,3,9,2,4 |
||||
411453,5,1,1,1,2,1,3,1,1,2 |
||||
320675,3,3,5,2,3,10,7,1,1,4 |
||||
428903,7,2,4,1,3,4,3,3,1,4 |
||||
431495,3,1,1,1,2,1,3,2,1,2 |
||||
432809,3,1,3,1,2,?,2,1,1,2 |
||||
434518,3,1,1,1,2,1,2,1,1,2 |
||||
452264,1,1,1,1,2,1,2,1,1,2 |
||||
456282,1,1,1,1,2,1,3,1,1,2 |
||||
476903,10,5,7,3,3,7,3,3,8,4 |
||||
486283,3,1,1,1,2,1,3,1,1,2 |
||||
486662,2,1,1,2,2,1,3,1,1,2 |
||||
488173,1,4,3,10,4,10,5,6,1,4 |
||||
492268,10,4,6,1,2,10,5,3,1,4 |
||||
508234,7,4,5,10,2,10,3,8,2,4 |
||||
527363,8,10,10,10,8,10,10,7,3,4 |
||||
529329,10,10,10,10,10,10,4,10,10,4 |
||||
535331,3,1,1,1,3,1,2,1,1,2 |
||||
543558,6,1,3,1,4,5,5,10,1,4 |
||||
555977,5,6,6,8,6,10,4,10,4,4 |
||||
560680,1,1,1,1,2,1,1,1,1,2 |
||||
561477,1,1,1,1,2,1,3,1,1,2 |
||||
563649,8,8,8,1,2,?,6,10,1,4 |
||||
601265,10,4,4,6,2,10,2,3,1,4 |
||||
606140,1,1,1,1,2,?,2,1,1,2 |
||||
606722,5,5,7,8,6,10,7,4,1,4 |
||||
616240,5,3,4,3,4,5,4,7,1,2 |
||||
61634,5,4,3,1,2,?,2,3,1,2 |
||||
625201,8,2,1,1,5,1,1,1,1,2 |
||||
63375,9,1,2,6,4,10,7,7,2,4 |
||||
635844,8,4,10,5,4,4,7,10,1,4 |
||||
636130,1,1,1,1,2,1,3,1,1,2 |
||||
640744,10,10,10,7,9,10,7,10,10,4 |
||||
646904,1,1,1,1,2,1,3,1,1,2 |
||||
653777,8,3,4,9,3,10,3,3,1,4 |
||||
659642,10,8,4,4,4,10,3,10,4,4 |
||||
666090,1,1,1,1,2,1,3,1,1,2 |
||||
666942,1,1,1,1,2,1,3,1,1,2 |
||||
667204,7,8,7,6,4,3,8,8,4,4 |
||||
673637,3,1,1,1,2,5,5,1,1,2 |
||||
684955,2,1,1,1,3,1,2,1,1,2 |
||||
688033,1,1,1,1,2,1,1,1,1,2 |
||||
691628,8,6,4,10,10,1,3,5,1,4 |
||||
693702,1,1,1,1,2,1,1,1,1,2 |
||||
704097,1,1,1,1,1,1,2,1,1,2 |
||||
704168,4,6,5,6,7,?,4,9,1,2 |
||||
706426,5,5,5,2,5,10,4,3,1,4 |
||||
709287,6,8,7,8,6,8,8,9,1,4 |
||||
718641,1,1,1,1,5,1,3,1,1,2 |
||||
721482,4,4,4,4,6,5,7,3,1,2 |
||||
730881,7,6,3,2,5,10,7,4,6,4 |
||||
733639,3,1,1,1,2,?,3,1,1,2 |
||||
733639,3,1,1,1,2,1,3,1,1,2 |
||||
733823,5,4,6,10,2,10,4,1,1,4 |
||||
740492,1,1,1,1,2,1,3,1,1,2 |
||||
743348,3,2,2,1,2,1,2,3,1,2 |
||||
752904,10,1,1,1,2,10,5,4,1,4 |
||||
756136,1,1,1,1,2,1,2,1,1,2 |
||||
760001,8,10,3,2,6,4,3,10,1,4 |
||||
760239,10,4,6,4,5,10,7,1,1,4 |
||||
76389,10,4,7,2,2,8,6,1,1,4 |
||||
764974,5,1,1,1,2,1,3,1,2,2 |
||||
770066,5,2,2,2,2,1,2,2,1,2 |
||||
785208,5,4,6,6,4,10,4,3,1,4 |
||||
785615,8,6,7,3,3,10,3,4,2,4 |
||||
792744,1,1,1,1,2,1,1,1,1,2 |
||||
797327,6,5,5,8,4,10,3,4,1,4 |
||||
798429,1,1,1,1,2,1,3,1,1,2 |
||||
704097,1,1,1,1,1,1,2,1,1,2 |
||||
806423,8,5,5,5,2,10,4,3,1,4 |
||||
809912,10,3,3,1,2,10,7,6,1,4 |
||||
810104,1,1,1,1,2,1,3,1,1,2 |
||||
814265,2,1,1,1,2,1,1,1,1,2 |
||||
814911,1,1,1,1,2,1,1,1,1,2 |
||||
822829,7,6,4,8,10,10,9,5,3,4 |
||||
826923,1,1,1,1,2,1,1,1,1,2 |
||||
830690,5,2,2,2,3,1,1,3,1,2 |
||||
831268,1,1,1,1,1,1,1,3,1,2 |
||||
832226,3,4,4,10,5,1,3,3,1,4 |
||||
832567,4,2,3,5,3,8,7,6,1,4 |
||||
836433,5,1,1,3,2,1,1,1,1,2 |
||||
837082,2,1,1,1,2,1,3,1,1,2 |
||||
846832,3,4,5,3,7,3,4,6,1,2 |
||||
850831,2,7,10,10,7,10,4,9,4,4 |
||||
855524,1,1,1,1,2,1,2,1,1,2 |
||||
857774,4,1,1,1,3,1,2,2,1,2 |
||||
859164,5,3,3,1,3,3,3,3,3,4 |
||||
859350,8,10,10,7,10,10,7,3,8,4 |
||||
866325,8,10,5,3,8,4,4,10,3,4 |
||||
873549,10,3,5,4,3,7,3,5,3,4 |
||||
877291,6,10,10,10,10,10,8,10,10,4 |
||||
877943,3,10,3,10,6,10,5,1,4,4 |
||||
888169,3,2,2,1,4,3,2,1,1,2 |
||||
888523,4,4,4,2,2,3,2,1,1,2 |
||||
896404,2,1,1,1,2,1,3,1,1,2 |
||||
897172,2,1,1,1,2,1,2,1,1,2 |
||||
95719,6,10,10,10,8,10,7,10,7,4 |
||||
160296,5,8,8,10,5,10,8,10,3,4 |
||||
342245,1,1,3,1,2,1,1,1,1,2 |
||||
428598,1,1,3,1,1,1,2,1,1,2 |
||||
492561,4,3,2,1,3,1,2,1,1,2 |
||||
493452,1,1,3,1,2,1,1,1,1,2 |
||||
493452,4,1,2,1,2,1,2,1,1,2 |
||||
521441,5,1,1,2,2,1,2,1,1,2 |
||||
560680,3,1,2,1,2,1,2,1,1,2 |
||||
636437,1,1,1,1,2,1,1,1,1,2 |
||||
640712,1,1,1,1,2,1,2,1,1,2 |
||||
654244,1,1,1,1,1,1,2,1,1,2 |
||||
657753,3,1,1,4,3,1,2,2,1,2 |
||||
685977,5,3,4,1,4,1,3,1,1,2 |
||||
805448,1,1,1,1,2,1,1,1,1,2 |
||||
846423,10,6,3,6,4,10,7,8,4,4 |
||||
1002504,3,2,2,2,2,1,3,2,1,2 |
||||
1022257,2,1,1,1,2,1,1,1,1,2 |
||||
1026122,2,1,1,1,2,1,1,1,1,2 |
||||
1071084,3,3,2,2,3,1,1,2,3,2 |
||||
1080233,7,6,6,3,2,10,7,1,1,4 |
||||
1114570,5,3,3,2,3,1,3,1,1,2 |
||||
1114570,2,1,1,1,2,1,2,2,1,2 |
||||
1116715,5,1,1,1,3,2,2,2,1,2 |
||||
1131411,1,1,1,2,2,1,2,1,1,2 |
||||
1151734,10,8,7,4,3,10,7,9,1,4 |
||||
1156017,3,1,1,1,2,1,2,1,1,2 |
||||
1158247,1,1,1,1,1,1,1,1,1,2 |
||||
1158405,1,2,3,1,2,1,2,1,1,2 |
||||
1168278,3,1,1,1,2,1,2,1,1,2 |
||||
1176187,3,1,1,1,2,1,3,1,1,2 |
||||
1196263,4,1,1,1,2,1,1,1,1,2 |
||||
1196475,3,2,1,1,2,1,2,2,1,2 |
||||
1206314,1,2,3,1,2,1,1,1,1,2 |
||||
1211265,3,10,8,7,6,9,9,3,8,4 |
||||
1213784,3,1,1,1,2,1,1,1,1,2 |
||||
1223003,5,3,3,1,2,1,2,1,1,2 |
||||
1223306,3,1,1,1,2,4,1,1,1,2 |
||||
1223543,1,2,1,3,2,1,1,2,1,2 |
||||
1229929,1,1,1,1,2,1,2,1,1,2 |
||||
1231853,4,2,2,1,2,1,2,1,1,2 |
||||
1234554,1,1,1,1,2,1,2,1,1,2 |
||||
1236837,2,3,2,2,2,2,3,1,1,2 |
||||
1237674,3,1,2,1,2,1,2,1,1,2 |
||||
1238021,1,1,1,1,2,1,2,1,1,2 |
||||
1238464,1,1,1,1,1,?,2,1,1,2 |
||||
1238633,10,10,10,6,8,4,8,5,1,4 |
||||
1238915,5,1,2,1,2,1,3,1,1,2 |
||||
1238948,8,5,6,2,3,10,6,6,1,4 |
||||
1239232,3,3,2,6,3,3,3,5,1,2 |
||||
1239347,8,7,8,5,10,10,7,2,1,4 |
||||
1239967,1,1,1,1,2,1,2,1,1,2 |
||||
1240337,5,2,2,2,2,2,3,2,2,2 |
||||
1253505,2,3,1,1,5,1,1,1,1,2 |
||||
1255384,3,2,2,3,2,3,3,1,1,2 |
||||
1257200,10,10,10,7,10,10,8,2,1,4 |
||||
1257648,4,3,3,1,2,1,3,3,1,2 |
||||
1257815,5,1,3,1,2,1,2,1,1,2 |
||||
1257938,3,1,1,1,2,1,1,1,1,2 |
||||
1258549,9,10,10,10,10,10,10,10,1,4 |
||||
1258556,5,3,6,1,2,1,1,1,1,2 |
||||
1266154,8,7,8,2,4,2,5,10,1,4 |
||||
1272039,1,1,1,1,2,1,2,1,1,2 |
||||
1276091,2,1,1,1,2,1,2,1,1,2 |
||||
1276091,1,3,1,1,2,1,2,2,1,2 |
||||
1276091,5,1,1,3,4,1,3,2,1,2 |
||||
1277629,5,1,1,1,2,1,2,2,1,2 |
||||
1293439,3,2,2,3,2,1,1,1,1,2 |
||||
1293439,6,9,7,5,5,8,4,2,1,2 |
||||
1294562,10,8,10,1,3,10,5,1,1,4 |
||||
1295186,10,10,10,1,6,1,2,8,1,4 |
||||
527337,4,1,1,1,2,1,1,1,1,2 |
||||
558538,4,1,3,3,2,1,1,1,1,2 |
||||
566509,5,1,1,1,2,1,1,1,1,2 |
||||
608157,10,4,3,10,4,10,10,1,1,4 |
||||
677910,5,2,2,4,2,4,1,1,1,2 |
||||
734111,1,1,1,3,2,3,1,1,1,2 |
||||
734111,1,1,1,1,2,2,1,1,1,2 |
||||
780555,5,1,1,6,3,1,2,1,1,2 |
||||
827627,2,1,1,1,2,1,1,1,1,2 |
||||
1049837,1,1,1,1,2,1,1,1,1,2 |
||||
1058849,5,1,1,1,2,1,1,1,1,2 |
||||
1182404,1,1,1,1,1,1,1,1,1,2 |
||||
1193544,5,7,9,8,6,10,8,10,1,4 |
||||
1201870,4,1,1,3,1,1,2,1,1,2 |
||||
1202253,5,1,1,1,2,1,1,1,1,2 |
||||
1227081,3,1,1,3,2,1,1,1,1,2 |
||||
1230994,4,5,5,8,6,10,10,7,1,4 |
||||
1238410,2,3,1,1,3,1,1,1,1,2 |
||||
1246562,10,2,2,1,2,6,1,1,2,4 |
||||
1257470,10,6,5,8,5,10,8,6,1,4 |
||||
1259008,8,8,9,6,6,3,10,10,1,4 |
||||
1266124,5,1,2,1,2,1,1,1,1,2 |
||||
1267898,5,1,3,1,2,1,1,1,1,2 |
||||
1268313,5,1,1,3,2,1,1,1,1,2 |
||||
1268804,3,1,1,1,2,5,1,1,1,2 |
||||
1276091,6,1,1,3,2,1,1,1,1,2 |
||||
1280258,4,1,1,1,2,1,1,2,1,2 |
||||
1293966,4,1,1,1,2,1,1,1,1,2 |
||||
1296572,10,9,8,7,6,4,7,10,3,4 |
||||
1298416,10,6,6,2,4,10,9,7,1,4 |
||||
1299596,6,6,6,5,4,10,7,6,2,4 |
||||
1105524,4,1,1,1,2,1,1,1,1,2 |
||||
1181685,1,1,2,1,2,1,2,1,1,2 |
||||
1211594,3,1,1,1,1,1,2,1,1,2 |
||||
1238777,6,1,1,3,2,1,1,1,1,2 |
||||
1257608,6,1,1,1,1,1,1,1,1,2 |
||||
1269574,4,1,1,1,2,1,1,1,1,2 |
||||
1277145,5,1,1,1,2,1,1,1,1,2 |
||||
1287282,3,1,1,1,2,1,1,1,1,2 |
||||
1296025,4,1,2,1,2,1,1,1,1,2 |
||||
1296263,4,1,1,1,2,1,1,1,1,2 |
||||
1296593,5,2,1,1,2,1,1,1,1,2 |
||||
1299161,4,8,7,10,4,10,7,5,1,4 |
||||
1301945,5,1,1,1,1,1,1,1,1,2 |
||||
1302428,5,3,2,4,2,1,1,1,1,2 |
||||
1318169,9,10,10,10,10,5,10,10,10,4 |
||||
474162,8,7,8,5,5,10,9,10,1,4 |
||||
787451,5,1,2,1,2,1,1,1,1,2 |
||||
1002025,1,1,1,3,1,3,1,1,1,2 |
||||
1070522,3,1,1,1,1,1,2,1,1,2 |
||||
1073960,10,10,10,10,6,10,8,1,5,4 |
||||
1076352,3,6,4,10,3,3,3,4,1,4 |
||||
1084139,6,3,2,1,3,4,4,1,1,4 |
||||
1115293,1,1,1,1,2,1,1,1,1,2 |
||||
1119189,5,8,9,4,3,10,7,1,1,4 |
||||
1133991,4,1,1,1,1,1,2,1,1,2 |
||||
1142706,5,10,10,10,6,10,6,5,2,4 |
||||
1155967,5,1,2,10,4,5,2,1,1,2 |
||||
1170945,3,1,1,1,1,1,2,1,1,2 |
||||
1181567,1,1,1,1,1,1,1,1,1,2 |
||||
1182404,4,2,1,1,2,1,1,1,1,2 |
||||
1204558,4,1,1,1,2,1,2,1,1,2 |
||||
1217952,4,1,1,1,2,1,2,1,1,2 |
||||
1224565,6,1,1,1,2,1,3,1,1,2 |
||||
1238186,4,1,1,1,2,1,2,1,1,2 |
||||
1253917,4,1,1,2,2,1,2,1,1,2 |
||||
1265899,4,1,1,1,2,1,3,1,1,2 |
||||
1268766,1,1,1,1,2,1,1,1,1,2 |
||||
1277268,3,3,1,1,2,1,1,1,1,2 |
||||
1286943,8,10,10,10,7,5,4,8,7,4 |
||||
1295508,1,1,1,1,2,4,1,1,1,2 |
||||
1297327,5,1,1,1,2,1,1,1,1,2 |
||||
1297522,2,1,1,1,2,1,1,1,1,2 |
||||
1298360,1,1,1,1,2,1,1,1,1,2 |
||||
1299924,5,1,1,1,2,1,2,1,1,2 |
||||
1299994,5,1,1,1,2,1,1,1,1,2 |
||||
1304595,3,1,1,1,1,1,2,1,1,2 |
||||
1306282,6,6,7,10,3,10,8,10,2,4 |
||||
1313325,4,10,4,7,3,10,9,10,1,4 |
||||
1320077,1,1,1,1,1,1,1,1,1,2 |
||||
1320077,1,1,1,1,1,1,2,1,1,2 |
||||
1320304,3,1,2,2,2,1,1,1,1,2 |
||||
1330439,4,7,8,3,4,10,9,1,1,4 |
||||
333093,1,1,1,1,3,1,1,1,1,2 |
||||
369565,4,1,1,1,3,1,1,1,1,2 |
||||
412300,10,4,5,4,3,5,7,3,1,4 |
||||
672113,7,5,6,10,4,10,5,3,1,4 |
||||
749653,3,1,1,1,2,1,2,1,1,2 |
||||
769612,3,1,1,2,2,1,1,1,1,2 |
||||
769612,4,1,1,1,2,1,1,1,1,2 |
||||
798429,4,1,1,1,2,1,3,1,1,2 |
||||
807657,6,1,3,2,2,1,1,1,1,2 |
||||
8233704,4,1,1,1,1,1,2,1,1,2 |
||||
837480,7,4,4,3,4,10,6,9,1,4 |
||||
867392,4,2,2,1,2,1,2,1,1,2 |
||||
869828,1,1,1,1,1,1,3,1,1,2 |
||||
1043068,3,1,1,1,2,1,2,1,1,2 |
||||
1056171,2,1,1,1,2,1,2,1,1,2 |
||||
1061990,1,1,3,2,2,1,3,1,1,2 |
||||
1113061,5,1,1,1,2,1,3,1,1,2 |
||||
1116192,5,1,2,1,2,1,3,1,1,2 |
||||
1135090,4,1,1,1,2,1,2,1,1,2 |
||||
1145420,6,1,1,1,2,1,2,1,1,2 |
||||
1158157,5,1,1,1,2,2,2,1,1,2 |
||||
1171578,3,1,1,1,2,1,1,1,1,2 |
||||
1174841,5,3,1,1,2,1,1,1,1,2 |
||||
1184586,4,1,1,1,2,1,2,1,1,2 |
||||
1186936,2,1,3,2,2,1,2,1,1,2 |
||||
1197527,5,1,1,1,2,1,2,1,1,2 |
||||
1222464,6,10,10,10,4,10,7,10,1,4 |
||||
1240603,2,1,1,1,1,1,1,1,1,2 |
||||
1240603,3,1,1,1,1,1,1,1,1,2 |
||||
1241035,7,8,3,7,4,5,7,8,2,4 |
||||
1287971,3,1,1,1,2,1,2,1,1,2 |
||||
1289391,1,1,1,1,2,1,3,1,1,2 |
||||
1299924,3,2,2,2,2,1,4,2,1,2 |
||||
1306339,4,4,2,1,2,5,2,1,2,2 |
||||
1313658,3,1,1,1,2,1,1,1,1,2 |
||||
1313982,4,3,1,1,2,1,4,8,1,2 |
||||
1321264,5,2,2,2,1,1,2,1,1,2 |
||||
1321321,5,1,1,3,2,1,1,1,1,2 |
||||
1321348,2,1,1,1,2,1,2,1,1,2 |
||||
1321931,5,1,1,1,2,1,2,1,1,2 |
||||
1321942,5,1,1,1,2,1,3,1,1,2 |
||||
1321942,5,1,1,1,2,1,3,1,1,2 |
||||
1328331,1,1,1,1,2,1,3,1,1,2 |
||||
1328755,3,1,1,1,2,1,2,1,1,2 |
||||
1331405,4,1,1,1,2,1,3,2,1,2 |
||||
1331412,5,7,10,10,5,10,10,10,1,4 |
||||
1333104,3,1,2,1,2,1,3,1,1,2 |
||||
1334071,4,1,1,1,2,3,2,1,1,2 |
||||
1343068,8,4,4,1,6,10,2,5,2,4 |
||||
1343374,10,10,8,10,6,5,10,3,1,4 |
||||
1344121,8,10,4,4,8,10,8,2,1,4 |
||||
142932,7,6,10,5,3,10,9,10,2,4 |
||||
183936,3,1,1,1,2,1,2,1,1,2 |
||||
324382,1,1,1,1,2,1,2,1,1,2 |
||||
378275,10,9,7,3,4,2,7,7,1,4 |
||||
385103,5,1,2,1,2,1,3,1,1,2 |
||||
690557,5,1,1,1,2,1,2,1,1,2 |
||||
695091,1,1,1,1,2,1,2,1,1,2 |
||||
695219,1,1,1,1,2,1,2,1,1,2 |
||||
824249,1,1,1,1,2,1,3,1,1,2 |
||||
871549,5,1,2,1,2,1,2,1,1,2 |
||||
878358,5,7,10,6,5,10,7,5,1,4 |
||||
1107684,6,10,5,5,4,10,6,10,1,4 |
||||
1115762,3,1,1,1,2,1,1,1,1,2 |
||||
1217717,5,1,1,6,3,1,1,1,1,2 |
||||
1239420,1,1,1,1,2,1,1,1,1,2 |
||||
1254538,8,10,10,10,6,10,10,10,1,4 |
||||
1261751,5,1,1,1,2,1,2,2,1,2 |
||||
1268275,9,8,8,9,6,3,4,1,1,4 |
||||
1272166,5,1,1,1,2,1,1,1,1,2 |
||||
1294261,4,10,8,5,4,1,10,1,1,4 |
||||
1295529,2,5,7,6,4,10,7,6,1,4 |
||||
1298484,10,3,4,5,3,10,4,1,1,4 |
||||
1311875,5,1,2,1,2,1,1,1,1,2 |
||||
1315506,4,8,6,3,4,10,7,1,1,4 |
||||
1320141,5,1,1,1,2,1,2,1,1,2 |
||||
1325309,4,1,2,1,2,1,2,1,1,2 |
||||
1333063,5,1,3,1,2,1,3,1,1,2 |
||||
1333495,3,1,1,1,2,1,2,1,1,2 |
||||
1334659,5,2,4,1,1,1,1,1,1,2 |
||||
1336798,3,1,1,1,2,1,2,1,1,2 |
||||
1344449,1,1,1,1,1,1,2,1,1,2 |
||||
1350568,4,1,1,1,2,1,2,1,1,2 |
||||
1352663,5,4,6,8,4,1,8,10,1,4 |
||||
188336,5,3,2,8,5,10,8,1,2,4 |
||||
352431,10,5,10,3,5,8,7,8,3,4 |
||||
353098,4,1,1,2,2,1,1,1,1,2 |
||||
411453,1,1,1,1,2,1,1,1,1,2 |
||||
557583,5,10,10,10,10,10,10,1,1,4 |
||||
636375,5,1,1,1,2,1,1,1,1,2 |
||||
736150,10,4,3,10,3,10,7,1,2,4 |
||||
803531,5,10,10,10,5,2,8,5,1,4 |
||||
822829,8,10,10,10,6,10,10,10,10,4 |
||||
1016634,2,3,1,1,2,1,2,1,1,2 |
||||
1031608,2,1,1,1,1,1,2,1,1,2 |
||||
1041043,4,1,3,1,2,1,2,1,1,2 |
||||
1042252,3,1,1,1,2,1,2,1,1,2 |
||||
1057067,1,1,1,1,1,?,1,1,1,2 |
||||
1061990,4,1,1,1,2,1,2,1,1,2 |
||||
1073836,5,1,1,1,2,1,2,1,1,2 |
||||
1083817,3,1,1,1,2,1,2,1,1,2 |
||||
1096352,6,3,3,3,3,2,6,1,1,2 |
||||
1140597,7,1,2,3,2,1,2,1,1,2 |
||||
1149548,1,1,1,1,2,1,1,1,1,2 |
||||
1174009,5,1,1,2,1,1,2,1,1,2 |
||||
1183596,3,1,3,1,3,4,1,1,1,2 |
||||
1190386,4,6,6,5,7,6,7,7,3,4 |
||||
1190546,2,1,1,1,2,5,1,1,1,2 |
||||
1213273,2,1,1,1,2,1,1,1,1,2 |
||||
1218982,4,1,1,1,2,1,1,1,1,2 |
||||
1225382,6,2,3,1,2,1,1,1,1,2 |
||||
1235807,5,1,1,1,2,1,2,1,1,2 |
||||
1238777,1,1,1,1,2,1,1,1,1,2 |
||||
1253955,8,7,4,4,5,3,5,10,1,4 |
||||
1257366,3,1,1,1,2,1,1,1,1,2 |
||||
1260659,3,1,4,1,2,1,1,1,1,2 |
||||
1268952,10,10,7,8,7,1,10,10,3,4 |
||||
1275807,4,2,4,3,2,2,2,1,1,2 |
||||
1277792,4,1,1,1,2,1,1,1,1,2 |
||||
1277792,5,1,1,3,2,1,1,1,1,2 |
||||
1285722,4,1,1,3,2,1,1,1,1,2 |
||||
1288608,3,1,1,1,2,1,2,1,1,2 |
||||
1290203,3,1,1,1,2,1,2,1,1,2 |
||||
1294413,1,1,1,1,2,1,1,1,1,2 |
||||
1299596,2,1,1,1,2,1,1,1,1,2 |
||||
1303489,3,1,1,1,2,1,2,1,1,2 |
||||
1311033,1,2,2,1,2,1,1,1,1,2 |
||||
1311108,1,1,1,3,2,1,1,1,1,2 |
||||
1315807,5,10,10,10,10,2,10,10,10,4 |
||||
1318671,3,1,1,1,2,1,2,1,1,2 |
||||
1319609,3,1,1,2,3,4,1,1,1,2 |
||||
1323477,1,2,1,3,2,1,2,1,1,2 |
||||
1324572,5,1,1,1,2,1,2,2,1,2 |
||||
1324681,4,1,1,1,2,1,2,1,1,2 |
||||
1325159,3,1,1,1,2,1,3,1,1,2 |
||||
1326892,3,1,1,1,2,1,2,1,1,2 |
||||
1330361,5,1,1,1,2,1,2,1,1,2 |
||||
1333877,5,4,5,1,8,1,3,6,1,2 |
||||
1334015,7,8,8,7,3,10,7,2,3,4 |
||||
1334667,1,1,1,1,2,1,1,1,1,2 |
||||
1339781,1,1,1,1,2,1,2,1,1,2 |
||||
1339781,4,1,1,1,2,1,3,1,1,2 |
||||
13454352,1,1,3,1,2,1,2,1,1,2 |
||||
1345452,1,1,3,1,2,1,2,1,1,2 |
||||
1345593,3,1,1,3,2,1,2,1,1,2 |
||||
1347749,1,1,1,1,2,1,1,1,1,2 |
||||
1347943,5,2,2,2,2,1,1,1,2,2 |
||||
1348851,3,1,1,1,2,1,3,1,1,2 |
||||
1350319,5,7,4,1,6,1,7,10,3,4 |
||||
1350423,5,10,10,8,5,5,7,10,1,4 |
||||
1352848,3,10,7,8,5,8,7,4,1,4 |
||||
1353092,3,2,1,2,2,1,3,1,1,2 |
||||
1354840,2,1,1,1,2,1,3,1,1,2 |
||||
1354840,5,3,2,1,3,1,1,1,1,2 |
||||
1355260,1,1,1,1,2,1,2,1,1,2 |
||||
1365075,4,1,4,1,2,1,1,1,1,2 |
||||
1365328,1,1,2,1,2,1,2,1,1,2 |
||||
1368267,5,1,1,1,2,1,1,1,1,2 |
||||
1368273,1,1,1,1,2,1,1,1,1,2 |
||||
1368882,2,1,1,1,2,1,1,1,1,2 |
||||
1369821,10,10,10,10,5,10,10,10,7,4 |
||||
1371026,5,10,10,10,4,10,5,6,3,4 |
||||
1371920,5,1,1,1,2,1,3,2,1,2 |
||||
466906,1,1,1,1,2,1,1,1,1,2 |
||||
466906,1,1,1,1,2,1,1,1,1,2 |
||||
534555,1,1,1,1,2,1,1,1,1,2 |
||||
536708,1,1,1,1,2,1,1,1,1,2 |
||||
566346,3,1,1,1,2,1,2,3,1,2 |
||||
603148,4,1,1,1,2,1,1,1,1,2 |
||||
654546,1,1,1,1,2,1,1,1,8,2 |
||||
654546,1,1,1,3,2,1,1,1,1,2 |
||||
695091,5,10,10,5,4,5,4,4,1,4 |
||||
714039,3,1,1,1,2,1,1,1,1,2 |
||||
763235,3,1,1,1,2,1,2,1,2,2 |
||||
776715,3,1,1,1,3,2,1,1,1,2 |
||||
841769,2,1,1,1,2,1,1,1,1,2 |
||||
888820,5,10,10,3,7,3,8,10,2,4 |
||||
897471,4,8,6,4,3,4,10,6,1,4 |
||||
897471,4,8,8,5,4,5,10,4,1,4 |
@ -0,0 +1,126 @@
|
||||
Citation Request: |
||||
This breast cancer database was obtained from the University of Wisconsin |
||||
Hospitals, Madison from Dr. William H. Wolberg. If you publish results |
||||
when using this database, then please include this information in your |
||||
acknowledgements. Also, please cite one or more of: |
||||
|
||||
1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear |
||||
programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18. |
||||
|
||||
2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of |
||||
pattern separation for medical diagnosis applied to breast cytology", |
||||
Proceedings of the National Academy of Sciences, U.S.A., Volume 87, |
||||
December 1990, pp 9193-9196. |
||||
|
||||
3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition |
||||
via linear programming: Theory and application to medical diagnosis", |
||||
in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying |
||||
Li, editors, SIAM Publications, Philadelphia 1990, pp 22-30. |
||||
|
||||
4. K. P. Bennett & O. L. Mangasarian: "Robust linear programming |
||||
discrimination of two linearly inseparable sets", Optimization Methods |
||||
and Software 1, 1992, 23-34 (Gordon & Breach Science Publishers). |
||||
|
||||
1. Title: Wisconsin Breast Cancer Database (January 8, 1991) |
||||
|
||||
2. Sources: |
||||
-- Dr. William H. Wolberg (physician) |
||||
University of Wisconsin Hospitals |
||||
Madison, Wisconsin |
||||
USA |
||||
-- Donor: Olvi Mangasarian (mangasarian@cs.wisc.edu) |
||||
Received by David W. Aha (aha@cs.jhu.edu) |
||||
-- Date: 15 July 1992 |
||||
|
||||
3. Past Usage: |
||||
|
||||
Attributes 2 through 10 have been used to represent instances. |
||||
Each instance has one of 2 possible classes: benign or malignant. |
||||
|
||||
1. Wolberg,~W.~H., \& Mangasarian,~O.~L. (1990). Multisurface method of |
||||
pattern separation for medical diagnosis applied to breast cytology. In |
||||
{\it Proceedings of the National Academy of Sciences}, {\it 87}, |
||||
9193--9196. |
||||
-- Size of data set: only 369 instances (at that point in time) |
||||
-- Collected classification results: 1 trial only |
||||
-- Two pairs of parallel hyperplanes were found to be consistent with |
||||
50% of the data |
||||
-- Accuracy on remaining 50% of dataset: 93.5% |
||||
-- Three pairs of parallel hyperplanes were found to be consistent with |
||||
67% of data |
||||
-- Accuracy on remaining 33% of dataset: 95.9% |
||||
|
||||
2. Zhang,~J. (1992). Selecting typical instances in instance-based |
||||
learning. In {\it Proceedings of the Ninth International Machine |
||||
Learning Conference} (pp. 470--479). Aberdeen, Scotland: Morgan |
||||
Kaufmann. |
||||
-- Size of data set: only 369 instances (at that point in time) |
||||
-- Applied 4 instance-based learning algorithms |
||||
-- Collected classification results averaged over 10 trials |
||||
-- Best accuracy result: |
||||
-- 1-nearest neighbor: 93.7% |
||||
-- trained on 200 instances, tested on the other 169 |
||||
-- Also of interest: |
||||
-- Using only typical instances: 92.2% (storing only 23.1 instances) |
||||
-- trained on 200 instances, tested on the other 169 |
||||
|
||||
4. Relevant Information: |
||||
|
||||
Samples arrive periodically as Dr. Wolberg reports his clinical cases. |
||||
The database therefore reflects this chronological grouping of the data. |
||||
This grouping information appears immediately below, having been removed |
||||
from the data itself: |
||||
|
||||
Group 1: 367 instances (January 1989) |
||||
Group 2: 70 instances (October 1989) |
||||
Group 3: 31 instances (February 1990) |
||||
Group 4: 17 instances (April 1990) |
||||
Group 5: 48 instances (August 1990) |
||||
Group 6: 49 instances (Updated January 1991) |
||||
Group 7: 31 instances (June 1991) |
||||
Group 8: 86 instances (November 1991) |
||||
----------------------------------------- |
||||
Total: 699 points (as of the donated database on 15 July 1992) |
||||
|
||||
Note that the results summarized above in Past Usage refer to a dataset |
||||
of size 369, while Group 1 has only 367 instances. This is because it |
||||
originally contained 369 instances; 2 were removed. The following |
||||
statements summarize changes to the original Group 1's set of data: |
||||
|
||||
##### Group 1 : 367 points: 200B 167M (January 1989) |
||||
##### Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805 |
||||
##### Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record |
||||
##### : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial |
||||
##### : Changed 0 to 1 in field 6 of sample 1219406 |
||||
##### : Changed 0 to 1 in field 8 of following sample: |
||||
##### : 1182404,2,3,1,1,1,2,0,1,1,1 |
||||
|
||||
5. Number of Instances: 699 (as of 15 July 1992) |
||||
|
||||
6. Number of Attributes: 10 plus the class attribute |
||||
|
||||
7. Attribute Information: (class attribute has been moved to last column) |
||||
|
||||
# Attribute Domain |
||||
-- ----------------------------------------- |
||||
1. Sample code number id number |
||||
2. Clump Thickness 1 - 10 |
||||
3. Uniformity of Cell Size 1 - 10 |
||||
4. Uniformity of Cell Shape 1 - 10 |
||||
5. Marginal Adhesion 1 - 10 |
||||
6. Single Epithelial Cell Size 1 - 10 |
||||
7. Bare Nuclei 1 - 10 |
||||
8. Bland Chromatin 1 - 10 |
||||
9. Normal Nucleoli 1 - 10 |
||||
10. Mitoses 1 - 10 |
||||
11. Class: (2 for benign, 4 for malignant) |
||||
|
||||
8. Missing attribute values: 16 |
||||
|
||||
There are 16 instances in Groups 1 to 6 that contain a single missing |
||||
(i.e., unavailable) attribute value, now denoted by "?". |
||||
|
||||
9. Class distribution: |
||||
|
||||
Benign: 458 (65.5%) |
||||
Malignant: 241 (34.5%) |
|
@ -0,0 +1,73 @@
|
||||
Citation Request: |
||||
This breast cancer domain was obtained from the University Medical Centre, |
||||
Institute of Oncology, Ljubljana, Yugoslavia. Thanks go to M. Zwitter and |
||||
M. Soklic for providing the data. Please include this citation if you plan |
||||
to use this database. |
||||
|
||||
1. Title: Breast cancer data (Michalski has used this) |
||||
|
||||
2. Sources: |
||||
-- Matjaz Zwitter & Milan Soklic (physicians) |
||||
Institute of Oncology |
||||
University Medical Center |
||||
Ljubljana, Yugoslavia |
||||
-- Donors: Ming Tan and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu) |
||||
-- Date: 11 July 1988 |
||||
|
||||
3. Past Usage: (Several: here are some) |
||||
-- Michalski,R.S., Mozetic,I., Hong,J., & Lavrac,N. (1986). The |
||||
Multi-Purpose Incremental Learning System AQ15 and its Testing |
||||
Application to Three Medical Domains. In Proceedings of the |
||||
Fifth National Conference on Artificial Intelligence, 1041-1045, |
||||
Philadelphia, PA: Morgan Kaufmann. |
||||
-- accuracy range: 66%-72% |
||||
-- Clark,P. & Niblett,T. (1987). Induction in Noisy Domains. In |
||||
Progress in Machine Learning (from the Proceedings of the 2nd |
||||
European Working Session on Learning), 11-30, Bled, |
||||
Yugoslavia: Sigma Press. |
||||
-- 8 test results given: 65%-72% accuracy range |
||||
-- Tan, M., & Eshelman, L. (1988). Using weighted networks to |
||||
represent classification knowledge in noisy domains. Proceedings |
||||
of the Fifth International Conference on Machine Learning, 121-134, |
||||
Ann Arbor, MI. |
||||
-- 4 systems tested: accuracy range was 68%-73.5% |
||||
-- Cestnik,G., Kononenko,I., & Bratko,I. (1987). Assistant-86: A |
||||
Knowledge-Elicitation Tool for Sophisticated Users. In I.Bratko |
||||
& N.Lavrac (Eds.) Progress in Machine Learning, 31-45, Sigma Press. |
||||
-- Assistant-86: 78% accuracy |
||||
|
||||
4. Relevant Information: |
||||
This is one of three domains provided by the Oncology Institute |
||||
that has repeatedly appeared in the machine learning literature. |
||||
(See also lymphography and primary-tumor.) |
||||
|
||||
This data set includes 201 instances of one class and 85 instances of |
||||
another class. The instances are described by 9 attributes, some of |
||||
which are linear and some are nominal. |
||||
|
||||
5. Number of Instances: 286 |
||||
|
||||
6. Number of Attributes: 9 + the class attribute |
||||
|
||||
7. Attribute Information: |
||||
1. Class: no-recurrence-events, recurrence-events |
||||
2. age: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99. |
||||
3. menopause: lt40, ge40, premeno. |
||||
4. tumor-size: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, |
||||
45-49, 50-54, 55-59. |
||||
5. inv-nodes: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, |
||||
27-29, 30-32, 33-35, 36-39. |
||||
6. node-caps: yes, no. |
||||
7. deg-malig: 1, 2, 3. |
||||
8. breast: left, right. |
||||
9. breast-quad: left-up, left-low, right-up, right-low, central. |
||||
10. irradiat: yes, no. |
||||
|
||||
8. Missing Attribute Values: (denoted by "?") |
||||
Attribute #: Number of instances with missing values: |
||||
6. 8 |
||||
9. 1. |
||||
|
||||
9. Class Distribution: |
||||
1. no-recurrence-events: 201 instances |
||||
2. recurrence-events: 85 instances |
@ -0,0 +1,301 @@
|
||||
The Forest CoverType dataset |
||||
|
||||
|
||||
1. Title of Database: |
||||
|
||||
Forest Covertype data |
||||
|
||||
|
||||
2. Sources: |
||||
|
||||
(a) Original owners of database: |
||||
Remote Sensing and GIS Program |
||||
Department of Forest Sciences |
||||
College of Natural Resources |
||||
Colorado State University |
||||
Fort Collins, CO 80523 |
||||
(contact Jock A. Blackard, jblackard 'at' fs.fed.us |
||||
or Dr. Denis J. Dean, denis.dean 'at' utdallas.edu) |
||||
|
||||
NOTE: Reuse of this database is unlimited with retention of |
||||
copyright notice for Jock A. Blackard and Colorado |
||||
State University. |
||||
|
||||
(b) Donors of database: |
||||
Jock A. Blackard (jblackard 'at' fs.fed.us) |
||||
GIS Coordinator |
||||
USFS - Forest Inventory & Analysis |
||||
Rocky Mountain Research Station |
||||
507 25th Street |
||||
Ogden, UT 84401 |
||||
|
||||
Dr. Denis J. Dean (denis.dean 'at' utdallas.edu) |
||||
Professor |
||||
Program in Geography and Geospatial Sciences |
||||
School of Economic, Political and Policy Sciences |
||||
800 West Campbell Rd |
||||
Richardson, TX 75080-3021 |
||||
|
||||
Dr. Charles W. Anderson (anderson 'at' cs.colostate.edu) |
||||
Associate Professor |
||||
Department of Computer Science |
||||
Colorado State University |
||||
Fort Collins, CO 80523 USA |
||||
|
||||
(c) Date donated: August 1998 |
||||
|
||||
|
||||
3. Past Usage: |
||||
|
||||
Blackard, Jock A. and Denis J. Dean. 2000. "Comparative |
||||
Accuracies of Artificial Neural Networks and Discriminant |
||||
Analysis in Predicting Forest Cover Types from Cartographic |
||||
Variables." Computers and Electronics in Agriculture |
||||
24(3):131-151. |
||||
|
||||
Blackard, Jock A. and Denis J. Dean. 1998. "Comparative |
||||
Accuracies of Neural Networks and Discriminant Analysis |
||||
in Predicting Forest Cover Types from Cartographic |
||||
Variables." Second Southern Forestry GIS Conference. |
||||
University of Georgia. Athens, GA. Pages 189-199. |
||||
|
||||
Blackard, Jock A. 1998. "Comparison of Neural Networks and |
||||
Discriminant Analysis in Predicting Forest Cover Types." |
||||
Ph.D. dissertation. Department of Forest Sciences. |
||||
Colorado State University. Fort Collins, Colorado. |
||||
165 pages. |
||||
|
||||
Abstract of dissertation: |
||||
Natural resource managers responsible for developing |
||||
ecosystem management strategies require basic descriptive |
||||
information including inventory data for forested lands to |
||||
support their decision-making processes. However, managers |
||||
generally do not have this type of data for inholdings or |
||||
neighboring lands that are outside their immediate |
||||
jurisdiction. One method of obtaining this information is |
||||
through the use of predictive models. |
||||
Two predictive models were examined in this study, a |
||||
feedforward neural network model and a more traditional |
||||
statistical model based on discriminant analysis. The overall |
||||
objectives of this research were to first construct these two |
||||
predictive models, and second to compare and evaluate their |
||||
respective classification accuracies when predicting forest |
||||
cover types in undisturbed forests. |
||||
The study area included four wilderness areas found in |
||||
the Roosevelt National Forest of northern Colorado. A total |
||||
of twelve cartographic measures were utilized as independent |
||||
variables in the predictive models, while seven major forest |
||||
cover types were used as dependent variables. Several subsets |
||||
of these variables were examined to determine the best overall |
||||
predictive model. |
||||
For each subset of cartographic variables examined in |
||||
this study, relative classification accuracies indicate the |
||||
neural network approach outperformed the traditional |
||||
discriminant analysis method in predicting forest cover types. |
||||
The final neural network model had a higher absolute |
||||
classification accuracy (70.58%) than the final corresponding |
||||
linear discriminant analysis model (58.38%). In support of these |
||||
classification results, thirty additional networks with randomly |
||||
selected initial weights were derived. From these networks, the |
||||
overall mean absolute classification accuracy for the neural |
||||
network method was 70.52%, with a 95% confidence interval of |
||||
70.26% to 70.80%. Consequently, natural resource managers may |
||||
utilize an alternative method of predicting forest cover types |
||||
that is both superior to the traditional statistical methods and |
||||
adequate to support their decision-making processes for |
||||
developing ecosystem management strategies. |
||||
|
||||
|
||||
-- Classification performance |
||||
-- first 11,340 records used for training data subset |
||||
-- next 3,780 records used for validation data subset |
||||
-- last 565,892 records used for testing data subset |
||||
-- 70% Neural Network (backpropagation) |
||||
-- 58% Linear Discriminant Analysis |
||||
|
||||
|
||||
4. Relevant Information Paragraph: |
||||
|
||||
Predicting forest cover type from cartographic variables only |
||||
(no remotely sensed data). The actual forest cover type for |
||||
a given observation (30 x 30 meter cell) was determined from |
||||
US Forest Service (USFS) Region 2 Resource Information System |
||||
(RIS) data. Independent variables were derived from data |
||||
originally obtained from US Geological Survey (USGS) and |
||||
USFS data. Data is in raw form (not scaled) and contains |
||||
binary (0 or 1) columns of data for qualitative independent |
||||
variables (wilderness areas and soil types). |
||||
|
||||
This study area includes four wilderness areas located in the |
||||
Roosevelt National Forest of northern Colorado. These areas |
||||
represent forests with minimal human-caused disturbances, |
||||
so that existing forest cover types are more a result of |
||||
ecological processes rather than forest management practices. |
||||
|
||||
Some background information for these four wilderness areas: |
||||
Neota (area 2) probably has the highest mean elevational value of |
||||
the 4 wilderness areas. Rawah (area 1) and Comanche Peak (area 3) |
||||
would have a lower mean elevational value, while Cache la Poudre |
||||
(area 4) would have the lowest mean elevational value. |
||||
|
||||
As for primary major tree species in these areas, Neota would have |
||||
spruce/fir (type 1), while Rawah and Comanche Peak would probably |
||||
have lodgepole pine (type 2) as their primary species, followed by |
||||
spruce/fir and aspen (type 5). Cache la Poudre would tend to have |
||||
Ponderosa pine (type 3), Douglas-fir (type 6), and |
||||
cottonwood/willow (type 4). |
||||
|
||||
The Rawah and Comanche Peak areas would tend to be more typical of |
||||
the overall dataset than either the Neota or Cache la Poudre, due |
||||
to their assortment of tree species and range of predictive |
||||
variable values (elevation, etc.) Cache la Poudre would probably |
||||
be more unique than the others, due to its relatively low |
||||
elevation range and species composition. |
||||
|
||||
|
||||
5. Number of instances (observations): 581,012 |
||||
|
||||
|
||||
6. Number of Attributes: 12 measures, but 54 columns of data |
||||
(10 quantitative variables, 4 binary |
||||
wilderness areas and 40 binary |
||||
soil type variables) |
||||
|
||||
|
||||
7. Attribute information: |
||||
|
||||
Given is the attribute name, attribute type, the measurement unit and |
||||
a brief description. The forest cover type is the classification |
||||
problem. The order of this listing corresponds to the order of |
||||
numerals along the rows of the database. |
||||
|
||||
Name Data Type Measurement Description |
||||
|
||||
Elevation quantitative meters Elevation in meters |
||||
Aspect quantitative azimuth Aspect in degrees azimuth |
||||
Slope quantitative degrees Slope in degrees |
||||
Horizontal_Distance_To_Hydrology quantitative meters Horz Dist to nearest surface water features |
||||
Vertical_Distance_To_Hydrology quantitative meters Vert Dist to nearest surface water features |
||||
Horizontal_Distance_To_Roadways quantitative meters Horz Dist to nearest roadway |
||||
Hillshade_9am quantitative 0 to 255 index Hillshade index at 9am, summer solstice |
||||
Hillshade_Noon quantitative 0 to 255 index Hillshade index at noon, summer solstice |
||||
Hillshade_3pm quantitative 0 to 255 index Hillshade index at 3pm, summer solstice |
||||
Horizontal_Distance_To_Fire_Points quantitative meters Horz Dist to nearest wildfire ignition points |
||||
Wilderness_Area (4 binary columns) qualitative 0 (absence) or 1 (presence) Wilderness area designation |
||||
Soil_Type (40 binary columns) qualitative 0 (absence) or 1 (presence) Soil Type designation |
||||
Cover_Type (7 types) integer 1 to 7 Forest Cover Type designation |
||||
|
||||
|
||||
Code Designations: |
||||
|
||||
Wilderness Areas: 1 -- Rawah Wilderness Area |
||||
2 -- Neota Wilderness Area |
||||
3 -- Comanche Peak Wilderness Area |
||||
4 -- Cache la Poudre Wilderness Area |
||||
|
||||
Soil Types: 1 to 40 : based on the USFS Ecological |
||||
Landtype Units (ELUs) for this study area: |
||||
|
||||
Study Code USFS ELU Code Description |
||||
1 2702 Cathedral family - Rock outcrop complex, extremely stony. |
||||
2 2703 Vanet - Ratake families complex, very stony. |
||||
3 2704 Haploborolis - Rock outcrop complex, rubbly. |
||||
4 2705 Ratake family - Rock outcrop complex, rubbly. |
||||
5 2706 Vanet family - Rock outcrop complex, rubbly. |
||||
6 2717 Vanet - Wetmore families - Rock outcrop complex, stony. |
||||
7 3501 Gothic family. |
||||
8 3502 Supervisor - Limber families complex. |
||||
9 4201 Troutville family, very stony. |
||||
10 4703 Bullwark - Catamount families - Rock outcrop complex, rubbly. |
||||
11 4704 Bullwark - Catamount families - Rock land complex, rubbly. |
||||
12 4744 Legault family - Rock land complex, stony. |
||||
13 4758 Catamount family - Rock land - Bullwark family complex, rubbly. |
||||
14 5101 Pachic Argiborolis - Aquolis complex. |
||||
15 5151 unspecified in the USFS Soil and ELU Survey. |
||||
16 6101 Cryaquolis - Cryoborolis complex. |
||||
17 6102 Gateview family - Cryaquolis complex. |
||||
18 6731 Rogert family, very stony. |
||||
19 7101 Typic Cryaquolis - Borohemists complex. |
||||
20 7102 Typic Cryaquepts - Typic Cryaquolls complex. |
||||
21 7103 Typic Cryaquolls - Leighcan family, till substratum complex. |
||||
22 7201 Leighcan family, till substratum, extremely bouldery. |
||||
23 7202 Leighcan family, till substratum - Typic Cryaquolls complex. |
||||
24 7700 Leighcan family, extremely stony. |
||||
25 7701 Leighcan family, warm, extremely stony. |
||||
26 7702 Granile - Catamount families complex, very stony. |
||||
27 7709 Leighcan family, warm - Rock outcrop complex, extremely stony. |
||||
28 7710 Leighcan family - Rock outcrop complex, extremely stony. |
||||
29 7745 Como - Legault families complex, extremely stony. |
||||
30 7746 Como family - Rock land - Legault family complex, extremely stony. |
||||
31 7755 Leighcan - Catamount families complex, extremely stony. |
||||
32 7756 Catamount family - Rock outcrop - Leighcan family complex, extremely stony. |
||||
33 7757 Leighcan - Catamount families - Rock outcrop complex, extremely stony. |
||||
34 7790 Cryorthents - Rock land complex, extremely stony. |
||||
35 8703 Cryumbrepts - Rock outcrop - Cryaquepts complex. |
||||
36 8707 Bross family - Rock land - Cryumbrepts complex, extremely stony. |
||||
37 8708 Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony. |
||||
38 8771 Leighcan - Moran families - Cryaquolls complex, extremely stony. |
||||
39 8772 Moran family - Cryorthents - Leighcan family complex, extremely stony. |
||||
40 8776 Moran family - Cryorthents - Rock land complex, extremely stony. |
||||
|
||||
Note: First digit: climatic zone Second digit: geologic zones |
||||
1. lower montane dry 1. alluvium |
||||
2. lower montane 2. glacial |
||||
3. montane dry 3. shale |
||||
4. montane 4. sandstone |
||||
5. montane dry and montane 5. mixed sedimentary |
||||
6. montane and subalpine 6. unspecified in the USFS ELU Survey |
||||
7. subalpine 7. igneous and metamorphic |
||||
8. alpine 8. volcanic |
||||
|
||||
The third and fourth ELU digits are unique to the mapping unit |
||||
and have no special meaning to the climatic or geologic zones. |
||||
|
||||
Forest Cover Type Classes: 1 -- Spruce/Fir |
||||
2 -- Lodgepole Pine |
||||
3 -- Ponderosa Pine |
||||
4 -- Cottonwood/Willow |
||||
5 -- Aspen |
||||
6 -- Douglas-fir |
||||
7 -- Krummholz |
||||
|
||||
|
||||
8. Basic Summary Statistics for quantitative variables only |
||||
(whole dataset -- thanks to Phil Rennert for the summary values): |
||||
|
||||
Name Units Mean Std Dev |
||||
Elevation meters 2959.36 279.98 |
||||
Aspect azimuth 155.65 111.91 |
||||
Slope degrees 14.10 7.49 |
||||
Horizontal_Distance_To_Hydrology meters 269.43 212.55 |
||||
Vertical_Distance_To_Hydrology meters 46.42 58.30 |
||||
Horizontal_Distance_To_Roadways meters 2350.15 1559.25 |
||||
Hillshade_9am 0 to 255 index 212.15 26.77 |
||||
Hillshade_Noon 0 to 255 index 223.32 19.77 |
||||
Hillshade_3pm 0 to 255 index 142.53 38.27 |
||||
Horizontal_Distance_To_Fire_Points meters 1980.29 1324.19 |
||||
|
||||
|
||||
9. Missing Attribute Values: None. |
||||
|
||||
|
||||
10. Class distribution: |
||||
|
||||
Number of records of Spruce-Fir: 211840 |
||||
Number of records of Lodgepole Pine: 283301 |
||||
Number of records of Ponderosa Pine: 35754 |
||||
Number of records of Cottonwood/Willow: 2747 |
||||
Number of records of Aspen: 9493 |
||||
Number of records of Douglas-fir: 17367 |
||||
Number of records of Krummholz: 20510 |
||||
Number of records of other: 0 |
||||
|
||||
Total records: 581012 |
||||
|
||||
===================================================================== |
||||
Jock A. Blackard |
||||
08/28/1998 -- original text |
||||
12/07/1999 -- updated mailing address, citations, background info |
||||
for study area, added summary statistics. |
||||
===================================================================== |
||||
|
@ -0,0 +1,339 @@
|
||||
# D04 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
If you finished yesterday's exercises you should be able to train several Machine Learning algorithms and to choose the one returned by GridSearchCV. |
||||
GridSearchCV returns the model that gives the best score on the test set. Yesterday, as I told you, I changed the **cv** parameter to compute the GridSearch with a train set and a test set. |
||||
It means that the selected model is based on one single measure. What if, by luck, we predict correctly on that split? What if the best model is bad? What if I could have selected a better model? |
||||
|
||||
We will answer these questions today! The topics we will cover are among the most important in Machine Learning. |
||||
Must read before starting the exercises: |
||||
|
||||
- Bias-variance trade-off, a.k.a. underfitting/overfitting. |
||||
- https://machinelearningmastery.com/gentle-introduction-to-the-bias-variance-trade-off-in-machine-learning/ |
||||
|
||||
- https://jakevdp.github.io/PythonDataScienceHandbook/05.03-hyperparameters-and-model-validation.html |
||||
|
||||
- Cross-validation |
||||
- https://algotrading101.com/learn/train-test-split/ |
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources |
||||
|
||||
|
||||
# Exercise 1: K-Fold |
||||
|
||||
The goal of this exercise is to learn to use `KFold` to split the data set for a k-fold cross validation. Most of the time you won't call this function directly, because it is used under the hood by functions such as `cross_val_score`, `cross_validate` or `GridSearchCV`. But it helps to understand how the splitting works, and it lets you create a custom split if needed. |
||||
|
||||
``` |
||||
import numpy as np |

X = np.array(np.arange(1,21).reshape(10,-1)) |
||||
y = np.array(np.arange(1,11)) |
||||
``` |
||||
|
||||
1. Using `KFold`, perform a 5-fold cross validation. For each fold, print the train index and test index. The expected output is: |
||||
|
||||
``` |
||||
Fold: 1 |
||||
TRAIN: [2 3 4 5 6 7 8 9] TEST: [0 1] |
||||
|
||||
Fold: 2 |
||||
TRAIN: [0 1 4 5 6 7 8 9] TEST: [2 3] |
||||
|
||||
Fold: 3 |
||||
TRAIN: [0 1 2 3 6 7 8 9] TEST: [4 5] |
||||
|
||||
Fold: 4 |
||||
TRAIN: [0 1 2 3 4 5 8 9] TEST: [6 7] |
||||
|
||||
Fold: 5 |
||||
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] |
||||
``` |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output of the 5-fold cross validation is: |
||||
|
||||
|
||||
``` |
||||
Fold: 1 |
||||
TRAIN: [2 3 4 5 6 7 8 9] TEST: [0 1] |
||||
|
||||
Fold: 2 |
||||
TRAIN: [0 1 4 5 6 7 8 9] TEST: [2 3] |
||||
|
||||
Fold: 3 |
||||
TRAIN: [0 1 2 3 6 7 8 9] TEST: [4 5] |
||||
|
||||
Fold: 4 |
||||
TRAIN: [0 1 2 3 4 5 8 9] TEST: [6 7] |
||||
|
||||
Fold: 5 |
||||
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] |
||||
``` |
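
A minimal sketch that reproduces this output, assuming the `X` and `y` arrays given in the exercise statement:

```
import numpy as np
from sklearn.model_selection import KFold

X = np.array(np.arange(1, 21).reshape(10, -1))
y = np.array(np.arange(1, 11))

# Without shuffling, KFold cuts the indices into 5 contiguous folds.
kf = KFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print(f"Fold: {fold}")
    print("TRAIN:", train_index, "TEST:", test_index)
    print()
```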
||||
|
||||
|
||||
|
||||
# Exercise 2: Cross validation (k-fold) |
||||
|
||||
The goal of this exercise is to learn how to use cross validation. After reading the articles you should be able to explain why we need to cross-validate models. We will first focus on Linear Regression to reduce the computation time. We will be using `cross_validate` to run the cross validation. Note that `cross_val_score` is similar, but `cross_validate` calculates one or more scores and timings for each CV split. |
||||
|
||||
Preliminary: |
||||
|
||||
- Import the California Housing data set and split it into a train set and a test set (10%). Fit a linear regression on the data set. *The goal is to focus on the cross validation, which is why the code to fit the Linear Regression is given.* |
||||
|
||||
|
||||
|
||||
``` |
||||
#imports |
||||
from sklearn.datasets import fetch_california_housing |
||||
from sklearn.model_selection import train_test_split |
||||
from sklearn.linear_model import LinearRegression |
||||
from sklearn.preprocessing import StandardScaler |
||||
from sklearn.impute import SimpleImputer |
||||
from sklearn.pipeline import Pipeline |
||||
|
||||
#data |
||||
housing = fetch_california_housing() |
||||
X, y = housing['data'], housing['target'] |
||||
#split data train test |
||||
X_train, X_test, y_train, y_test = train_test_split(X, |
||||
y, |
||||
test_size=0.1, |
||||
shuffle=True, |
||||
random_state=43) |
||||
#pipeline |
||||
pipeline = [('imputer', SimpleImputer(strategy='median')), |
||||
('scaler', StandardScaler()), |
||||
('lr', LinearRegression())] |
||||
pipe = Pipeline(pipeline) |
||||
``` |
||||
|
||||
|
||||
|
||||
1. Cross validate the Pipeline using `cross_validate` with 10 folds. Print the scores on each validation set, the mean score on the validation sets and the standard deviation on the validation sets. The expected output is: |
||||
|
||||
``` |
||||
Scores on validation sets: |
||||
[0.62433594 0.61648956 0.62486602 0.59891024 0.59284295 0.61307055 |
||||
0.54630341 0.60742976 0.60014575 0.59574508] |
||||
|
||||
Mean of scores on validation sets: |
||||
0.60201392526743 |
||||
|
||||
Standard deviation of scores on validation sets: |
||||
0.0214983822773466 |
||||
|
||||
``` |
||||
|
||||
**Note: It may be confusing that the key of the dictionary that returns the results on the validation sets is `test_score`. Sometimes, the validation sets are called test sets. In that case, we run the cross validation on X_train. It means that the scores are computed on sets from the initial train set. The X_test is not used for the cross-validation.** |
||||
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html |
||||
https://machinelearningmastery.com/how-to-configure-k-fold-cross-validation/ |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is: |
||||
|
||||
``` |
||||
Scores on validation sets: |
||||
[0.62433594 0.61648956 0.62486602 0.59891024 0.59284295 0.61307055 |
||||
0.54630341 0.60742976 0.60014575 0.59574508] |
||||
|
||||
Mean of scores on validation sets: |
||||
0.60201392526743 |
||||
|
||||
Standard deviation of scores on validation sets: |
||||
0.0214983822773466 |
||||
|
||||
``` |
||||
|
||||
The scores are consistent across folds: the model is stable. That is a first sign that the model is not overfitted. The average R2 is about 0.60, which is a good start, though there is room for improvement. |
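
A minimal sketch of the cross validation itself, assuming the `pipe`, `X_train` and `y_train` defined in the preliminary code:

```
from sklearn.model_selection import cross_validate

# 10-fold cross validation on the train set only; X_test stays untouched.
cv_results = cross_validate(pipe, X_train, y_train, cv=10)
scores = cv_results['test_score']

print("Scores on validation sets:\n", scores)
print("Mean of scores on validation sets:\n", scores.mean())
print("Standard deviation of scores on validation sets:\n", scores.std())
```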
||||
|
||||
|
||||
|
||||
# Exercise 3: GridSearchCV |
||||
|
||||
The goal of this exercise is to learn to use GridSearchCV to run a grid search, predict on the test set and score on the test set. |
||||
|
||||
Preliminary: |
||||
|
||||
- Import the California Housing data set and split it into a train set and a test set (10%). Fit a linear regression on the data set. *The goal is to focus on the grid search, which is why the code to fit the Linear Regression is given.* |
||||
|
||||
|
||||
``` |
||||
#imports |
||||
from sklearn.datasets import fetch_california_housing |
||||
from sklearn.model_selection import train_test_split |
||||
from sklearn.linear_model import LinearRegression |
||||
from sklearn.preprocessing import StandardScaler |
||||
from sklearn.impute import SimpleImputer |
||||
from sklearn.pipeline import Pipeline |
||||
|
||||
#data |
||||
housing = fetch_california_housing() |
||||
X, y = housing['data'], housing['target'] |
||||
#split data train test |
||||
X_train, X_test, y_train, y_test = train_test_split(X, |
||||
y, |
||||
test_size=0.1, |
||||
shuffle=True, |
||||
random_state=43) |
||||
#pipeline |
||||
pipeline = [('imputer', SimpleImputer(strategy='median')), |
||||
('scaler', StandardScaler()), |
||||
('lr', LinearRegression())] |
||||
pipe = Pipeline(pipeline) |
||||
``` |
||||
|
||||
|
||||
1. Run `GridSearchCV` on all CPUs with 5 folds, the MSE as score and Random Forest as the model, with: |
||||
|
||||
- max_depth between 1 and 20 (at least 3 values) |
||||
- n_estimators between 1 and 100 (at least 3 values) |
||||
|
||||
This may take a few minutes to run. |
||||
|
||||
*Hint*: The name of the metric to put in the parameter `scoring` is `neg_mean_squared_error`. The smaller the MSE, the better the model. On the contrary, the greater the R2, the better the model. `GridSearchCV` chooses the best model by selecting the one that maximizes the score on the validation sets. And, mathematically, maximizing a function or minimizing its opposite is equivalent. More details: |
||||
https://stackoverflow.com/questions/21443865/scikit-learn-cross-validation-negative-values-with-mean-squared-error |
||||
|
||||
|
||||
2. Extract the best fitted estimator, print its params, print its score on the validation set and print `cv_results_`. |
||||
|
||||
3. Compute the score on the test set. |
||||
|
||||
|
||||
WARNING: If the score used in classification is the AUC, there is one rare case where the AUC may return an error or a warning: when a fold contains only one class. In that case the AUC can't be computed, by definition. |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the code that runs the grid search is similar to: |
||||
|
||||
``` |
||||
from sklearn.ensemble import RandomForestRegressor |
from sklearn.model_selection import GridSearchCV |

parameters = {'n_estimators':[10, 50, 75], |
||||
'max_depth':[4, 7, 10]} |
||||
|
||||
rf = RandomForestRegressor() |
||||
gridsearch = GridSearchCV(rf, |
||||
parameters, |
||||
cv = 5, |
||||
n_jobs=-1, |
||||
scoring='neg_mean_squared_error') |
||||
|
||||
gridsearch.fit(X_train, y_train) |
||||
|
||||
``` |
||||
Answers that use another list of parameters are accepted too! |
||||
|
||||
2. This question is validated if you called these attributes: |
||||
|
||||
``` |
||||
print(gridsearch.best_score_) |
||||
print(gridsearch.best_params_) |
||||
print(gridsearch.cv_results_) |
||||
``` |
||||
The best score is -0.29028202683007526, which means the MSE is ~0.29. On its own this value doesn't give much information since the metric's scale is arbitrary. This score is the average of `neg_mean_squared_error` on all the validation sets. |
||||
|
||||
The best model's params are `{'max_depth': 10, 'n_estimators': 75}`. |
||||
|
||||
As you may have a different parameter list than this one, you may get different results. |
||||
|
||||
3. This question is validated if you used the fitted estimator to compute the score on the test set: `gridsearch.score(X_test, y_test)`. The MSE is ~0.27. The score I got on the test set is close to the score I got on the validation sets. It means the model is not overfitted. |
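
For reference, a minimal sketch, assuming the fitted `gridsearch` from question 1:

```
# With scoring='neg_mean_squared_error', score() returns the negative MSE,
# so a value around -0.27 corresponds to an MSE of ~0.27.
print(gridsearch.score(X_test, y_test))
```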
||||
|
||||
|
||||
|
||||
# Exercise 5: Validation curve and Learning curve |
||||
|
||||
The goal of this exercise is to learn to analyse a model's performance with two tools: |
||||
- Validation curve |
||||
- Learning curve |
||||
|
||||
For this exercise we will use a dataset of 100k data points to give you an idea of the computation time you can expect during projects. |
||||
|
||||
Preliminary: |
||||
|
||||
- Using `make_classification` from sklearn, generate a binary data set with 100k data points and 30 features. |
||||
|
||||
``` |
||||
from sklearn.datasets import make_classification |

X, y = make_classification(n_samples=100000, |
||||
n_features= 30, |
||||
n_informative=10, |
||||
flip_y=0.2 ) |
||||
``` |
||||
|
||||
|
||||
|
||||
1. Plot the validation curve, using all CPUs, with 5 folds. The goal is to focus again on max_depth between 1 and 20. |
||||
You may need to increase the window (for example, between 1 and 50) if you notice that other values of max_depth could have returned better results. This may take a few minutes. |
||||
|
||||
I do not expect you to implement the whole plot from scratch; you'd better leverage the code here: |
||||
https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve |
||||
|
||||
The plot should look like this: |
||||
|
||||
![alt text][logo_ex5q1] |
||||
|
||||
[logo_ex5q1]: images/day5/ex5/w2_day5_ex5_q1.png "Validation curve " |
||||
|
||||
The interpretation is that from max_depth=10, the train score keeps increasing but the test score (or validation score) reaches a plateau. It means that choosing max_depth = 20 may lead to an overfitted model. |
||||
|
||||
Note: Given the computation time, it is not possible to plot the validation curve for all parameters. It is most useful to plot it for the parameters that control overfitting the most. |
||||
|
||||
More details: https://chrisalbon.com/machine_learning/model_evaluation/plot_the_validation_curve/ |
||||
|
||||
|
||||
2. Let us assume the grid search returned `clf = RandomForestClassifier(max_depth=12)`. Let's check if the model underfits, overfits or fits correctly. Plot the learning curve. These two resources will help you a lot to understand how to analyse the learning curves and how to plot them: |
||||
|
||||
- https://machinelearningmastery.com/learning-curves-for-diagnosing-machine-learning-model-performance/ |
||||
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py |
||||
|
||||
|
||||
- **Re-use the function in the second resource**, change the cross validation to a classic 10-fold, run the learning curve data computation on all CPUs and plot the three plots as shown below. |
||||
|
||||
![alt text][logo_ex5q2] |
||||
|
||||
[logo_ex5q2]: images/day5/ex5/w2_day5_ex5_q2.png "Learning curve " |
||||
|
||||
- **Note Plot Learning Curves**: The learning curves are detailed in the first resource. |
||||
- **Note Plot Scalability of the model**: This plot shows the relationship between the time to train the model and the number of rows in the data. In this case the relationship is linear. |
||||
- **Note Performance of the model**: This plot shows whether it is worth increasing the training time by adding data to increase the score. Adding data would be worthwhile if the curve hadn't reached a plateau yet. In this case, increasing the training time by 10 units increases the score by less than 0.001. |
||||
## Correction |
||||
|
||||
1. This question is validated if the outputted plot looks like: |
||||
![alt text][logo_ex5q1] |
||||
|
||||
[logo_ex5q1]: images/day5/ex5/w2_day5_ex5_q1.png "Validation curve " |
||||
|
||||
The code that generated the data in the plot is: |
||||
|
||||
|
||||
``` |
||||
import numpy as np |
from sklearn.ensemble import RandomForestClassifier |
from sklearn.model_selection import validation_curve |
||||
|
||||
clf = RandomForestClassifier() |
||||
param_range = np.arange(1,30,2) |
||||
train_scores, test_scores = validation_curve(clf, |
||||
X, |
||||
y, |
||||
param_name="max_depth", |
||||
param_range=param_range, |
||||
scoring="roc_auc", |
||||
n_jobs=-1) |
||||
|
||||
``` |
||||
|
||||
2. This question is validated if the output is: |
||||
|
||||
![alt text][logo_ex5q2] |
||||
|
||||
[logo_ex5q2]: images/day5/ex5/w2_day5_ex5_q2.png "Learning curve " |
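
A minimal sketch of the underlying computation, assuming the `X` and `y` generated in the preliminary step (the three plots themselves are produced by the function re-used from the second resource):

```
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve

clf = RandomForestClassifier(max_depth=12)
# Classic 10-fold cross validation on all CPUs; return_times=True also returns
# the fit times used for the scalability and performance plots.
train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(
    clf, X, y, cv=10, n_jobs=-1, return_times=True)
```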
||||
|
||||
|
@ -0,0 +1,481 @@
|
||||
# W2D01 Piscine AI - Data Science |
||||
|
||||
The goal of this day is to understand practical Linear regression and supervised learning. |
||||
|
||||
|
||||
Author: |
||||
|
||||
# Table of Contents: |
||||
Historical part: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
The word "regression" was introduced by Sir Francis Galton (a cousin of C. Darwin) when he |
||||
studied the size of individuals within a progeny. He was trying to understand why |
||||
large individuals in a population appeared to have smaller children, closer |
||||
to the average population size; hence the introduction of the term "regression". |
||||
|
||||
Today we will learn a basic algorithm used in **supervised learning**: **Linear Regression**. We will be using **Scikit-learn**, which is a machine learning library. It is designed to interoperate with the Python libraries NumPy and Pandas. |
||||
We will also learn progressively the Machine Learning methodology for supervised learning - today we will focus on evaluating a machine learning model by splitting the data set into a train set and a test set. |
||||
|
||||
The scikit-learn version used in these exercises is `'0.22.1'`. |
||||
|
||||
## Rules |
||||
|
||||
## Resources |
||||
### To start with Scikit-learn: |
||||
- https://scikit-learn.org/stable/tutorial/basic/tutorial.html |
||||
|
||||
- https://jakevdp.github.io/PythonDataScienceHandbook/05.02-introducing-scikit-learn.html |
||||
|
||||
https://scikit-learn.org/stable/modules/linear_model.html |
||||
|
||||
### Machine learning methodology and algorithms: |
||||
|
||||
- This course provides a broad introduction to machine learning, data mining, and statistical pattern recognition. Andrew Ng is a star in the Machine Learning community. I recommend spending some time during the projects to focus on some algorithms. However, Python is not the language used for the course. https://www.coursera.org/learn/machine-learning |
||||
|
||||
- https://docs.microsoft.com/en-us/azure/machine-learning/algorithm-cheat-sheet |
||||
|
||||
https://scikit-learn.org/stable/tutorial/index.html |
||||
|
||||
### Linear Regression |
||||
|
||||
- https://towardsdatascience.com/laymans-introduction-to-linear-regression-8b334a3dab09 |
||||
|
||||
- https://towardsdatascience.com/linear-regression-the-actually-complete-introduction-67152323fcf2 |
||||
|
||||
### Train test split |
||||
|
||||
- https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/ |
||||
- https://developers.google.com/machine-learning/crash-course/training-and-test-sets/video-lecture?hl=en |
||||
|
||||
|
||||
# Exercise 1: Scikit-learn estimator |
||||
|
||||
The goal of this exercise is to learn to fit a Scikit-learn estimator and use it to predict. |
||||
|
||||
``` |
||||
|
||||
X, y = [[1],[2.1],[3]], [[1],[2],[3]] |
||||
|
||||
``` |
||||
1. Fit a LinearRegression from Scikit-learn with X the features and y the target. |
||||
|
||||
2. Predict for `x_pred = [[4]]` |
||||
|
||||
3. Print the coefficients (`coef_`), the intercept (`intercept_`) and the score (`score`) of the regression of X and y. |
||||
|
||||
|
||||
## Correction |
||||
|
||||
|
||||
1. This question is validated if the output of the fitted model is: |
||||
|
||||
``` |
||||
|
||||
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, |
||||
                 normalize=False) |
||||
``` |
||||
|
||||
2. This question is validated if the output is: |
||||
|
||||
``` |
||||
array([[3.96013289]]) |
||||
``` |
||||
3. This question is validated if the output is: |
||||
|
||||
``` |
||||
Coefficients: [[0.99667774]] |
||||
Intercept: [-0.02657807] |
||||
Score: 0.9966777408637874 |
||||
``` |
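
A minimal sketch that produces the three outputs above:

```
from sklearn.linear_model import LinearRegression

X, y = [[1], [2.1], [3]], [[1], [2], [3]]

reg = LinearRegression()
reg.fit(X, y)

print(reg.predict([[4]]))            # array([[3.96013289]])
print('Coefficients:', reg.coef_)    # [[0.99667774]]
print('Intercept:', reg.intercept_)  # [-0.02657807]
print('Score:', reg.score(X, y))     # 0.9966777408637874
```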
||||
|
||||
|
||||
# Exercise 2: Linear regression in 1D |
||||
|
||||
The goal of this exercise is to understand how linear regression works in one dimension. To do so, we will generate data in one dimension. Using `make_regression` from Scikit-learn, generate a data set with 100 observations: |
||||
|
||||
``` |
||||
from sklearn.datasets import make_regression |

X, y, coef = make_regression(n_samples=100, |
||||
n_features=1, |
||||
n_informative=1, |
||||
noise=10, |
||||
coef=True, |
||||
random_state=0, |
||||
bias=100.0) |
||||
``` |
||||
|
||||
1. Plot the data using matplotlib. The plot should look like this: |
||||
|
||||
![alt text][q1] |
||||
|
||||
[q1]: images/day1/ex2/w2_day1_ex2_q1.png "Scatter plot" |
||||
|
||||
2. Fit a LinearRegression from Scikit-learn on the generated data and give the equation of the fitted line. The expected output is: `y = coef * x + intercept` |
||||
3. Add the fitted line to the plot. The plot should look like this: |
||||
|
||||
![alt text][q3] |
||||
|
||||
[q3]: images/day1/ex2/w2_day1_ex2_q3.png "Scatter plot + fitted line" |
||||
|
||||
4. Predict on X |
||||
5. Create a function that computes the Mean Squared Error (MSE) and compute the MSE on the data set. *The MSE is frequently used as well as other regression metrics that will be studied later this week.* |
||||
``` |
||||
def compute_mse(y_true, y_pred): |
||||
#TODO |
||||
return mse |
||||
``` |
||||
|
||||
Change the `noise` parameter of `make_regression` to 50. |
||||
|
||||
6. Repeat questions 2 and 4 and compute the MSE on the new data. |
||||
|
||||
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot looks like: |
||||
|
||||
![alt text][q1] |
||||
|
||||
[q1]: images/day1/ex2/w2_day1_ex2_q1.png "Scatter plot" |
||||
|
||||
2. This question is validated if the equation of the fitted line is: `y = 42.619430291366946 * x + 99.18581817296929 |
||||
` |
||||
|
||||
3. This question is validated if the plot looks like: |
||||
|
||||
![alt text][q3] |
||||
|
||||
[q3]: images/day1/ex2/w2_day1_ex2_q3.png "Scatter plot + fitted line" |
||||
|
||||
4. This question is validated if the outputted prediction for the first 10 values are: |
||||
|
||||
``` |
||||
array([ 83.86186727, 140.80961751, 116.3333897 , 64.52998689, |
||||
61.34889539, 118.10301628, 57.5347917 , 117.44107847, |
||||
108.06237908, 85.90762675]) |
||||
``` |
||||
5. This question is validated if the MSE returned is `114.17148616819485` |
||||
|
||||
6. This question is validated if the MSE returned is `2854.2871542048706` |
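
A minimal sketch covering questions 2, 4 and 5 (the plotting questions are omitted):

```
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

def compute_mse(y_true, y_pred):
    # Mean of the squared differences between targets and predictions.
    return np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)

X, y, coef = make_regression(n_samples=100,
                             n_features=1,
                             n_informative=1,
                             noise=10,
                             coef=True,
                             random_state=0,
                             bias=100.0)

reg = LinearRegression().fit(X, y)
y_pred = reg.predict(X)

print(f'y = {reg.coef_[0]} * x + {reg.intercept_}')
print('MSE:', compute_mse(y, y_pred))
```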
||||
|
||||
# Exercise 3: Train test split |
||||
|
||||
The goal of this exercise is to learn to split a data set. It is important to understand why we split the data in two sets. To put it in a nutshell: the Machine Learning algorithm learns on the training data and is evaluated on data that it hasn't seen before: the testing data. |
||||
|
||||
This video gives a basic and nice explanation: https://www.youtube.com/watch?v=_vdMKioCXqQ |
||||
|
||||
This article explains the conditions to split the data and how to split it: https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/ |
||||
|
||||
``` |
||||
import numpy as np |

X = np.arange(1,21).reshape(10,-1) |
||||
y = np.arange(1,11) |
||||
``` |
||||
1. Split the data using `train_test_split` with `shuffle=False`. The test set represents 20% of the total size of the data set. Print X_train, y_train, X_test, y_test. |
||||
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if X_train, y_train, X_test, y_test match this output: |
||||
|
||||
``` |
||||
X_train: |
||||
[[ 1 2] |
||||
[ 3 4] |
||||
[ 5 6] |
||||
[ 7 8] |
||||
[ 9 10] |
||||
[11 12] |
||||
[13 14] |
||||
[15 16]] |
||||
|
||||
|
||||
y_train: |
||||
[1 2 3 4 5 6 7 8] |
||||
|
||||
|
||||
X_test: |
||||
[[17 18] |
||||
[19 20]] |
||||
|
||||
|
||||
y_test: |
||||
[ 9 10] |
||||
``` |
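
A minimal sketch that produces this output:

```
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(1, 21).reshape(10, -1)
y = np.arange(1, 11)

# shuffle=False keeps the original order, so the test set is the last 20% of rows.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    shuffle=False)

print('X_train:\n', X_train)
print('y_train:\n', y_train)
print('X_test:\n', X_test)
print('y_test:\n', y_test)
```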
||||
|
||||
# Exercise 4: Forecast diabetes progression |
||||
|
||||
|
||||
The goal of this exercise is to use Linear Regression to forecast the progression of diabetes. It will not always be stated explicitly, but you should **ALWAYS** start with an exploratory data analysis in order to have a good understanding of the data you model. As a reminder, here is an introduction to EDA: |
||||
https://towardsdatascience.com/exploratory-data-analysis-eda-a-practical-guide-and-template-for-structured-data-abfbf3ee3bd9 |
||||
|
||||
The data set used is described in https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes. |
||||
|
||||
``` |
||||
from sklearn.datasets import load_diabetes |
||||
diabetes = load_diabetes() |
||||
X, y = diabetes.data, diabetes.target |
||||
``` |
||||
1. Using `train_test_split`, split the data set into a train set and a test set (20%). Use `random_state=43` for reproducibility of the results. |
||||
|
||||
2. Fit the Linear Regression on all the variables. Give the coefficients and the intercept of the Linear Regression. What, then, is the equation? |
||||
|
||||
3. Predict on the test set. Predicting on the test set is like having new patients for whom you, as a physician, need to forecast the disease progression in one year given the 10 baseline variables. |
||||
|
||||
4. Compute the MSE on the train set and the test set. Later this week we will learn about the R2, which will help us evaluate the performance of this fitted Linear Regression. The MSE returns an arbitrary value that depends on the range of the errors. |
||||
|
||||
|
||||
|
||||
WARNING: This will be explained later this week. But here, we are doing something "dangerous". As you may have read in the data documentation, the data is scaled using the whole dataset, whereas we should first scale the data on the training set and then apply this scaling to the test set. This is a toy example, so let's ignore this detail for now. |
||||
|
||||
|
||||
|
||||
https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output of `y_train.values[:10]` and `y_test.values[:10]` are:
||||
``` |
||||
y_train.values[:10]: |
||||
[[202.] |
||||
[ 55.] |
||||
[202.] |
||||
[ 42.] |
||||
[214.] |
||||
[173.] |
||||
[118.] |
||||
[ 90.] |
||||
[129.] |
||||
[151.]] |
||||
|
||||
y_test.values[:10]: |
||||
[[ 71.] |
||||
[ 72.] |
||||
[235.] |
||||
[277.] |
||||
[109.] |
||||
[ 61.] |
||||
[109.] |
||||
[ 78.] |
||||
[ 66.] |
||||
[192.]] |
||||
|
||||
``` |
||||
2. This question is validated if the coefficients and the intercept are: |
||||
|
||||
``` |
||||
[('age', -60.40163046086952), |
||||
('sex', -226.08740652083418), |
||||
('bmi', 529.383623302316), |
||||
('bp', 259.96307686274605), |
||||
('s1', -859.121931974365), |
||||
('s2', 504.70960058378813), |
||||
('s3', 157.42034928335502), |
||||
('s4', 226.29533600601638), |
||||
('s5', 840.7938070846119), |
||||
('s6', 34.712225788519554), |
||||
('intercept', 152.05314895029233)] |
||||
``` |
||||
|
||||
3. This question is validated if the output of `predictions_on_test[:10]` is: |
||||
|
||||
``` |
||||
array([[111.74351759], |
||||
[ 98.41335251], |
||||
[168.36373195], |
||||
[255.05882934], |
||||
[168.43764643], |
||||
[117.60982186], |
||||
[198.86966323], |
||||
[126.28961941], |
||||
[117.73121787], |
||||
[224.83346984]]) |
||||
|
||||
``` |
||||
4. This question is validated if the mse on the **train set** is `2888.326888` and the mse on the **test set** is `2858.255153`. |
||||
|
||||
|
||||
# Exercise 5: Gradient Descent
||||
|
||||
The goal of this exercise is to understand how the Linear Regression algorithm finds the optimal coefficients.
||||
|
||||
The goal is to fit a Linear Regression on one-dimensional feature data **without using Scikit-learn**. Let's use the data set we generated for exercise 1:
||||
|
||||
|
||||
``` |
||||
from sklearn.datasets import make_regression
X, y, coef = make_regression(n_samples=100,
||||
n_features=1, |
||||
n_informative=1, |
||||
noise=10, |
||||
coef=True, |
||||
random_state=0, |
||||
bias=100.0) |
||||
``` |
||||
*Warning: The shape of X is not the same as the shape of y. You may need (for some questions) to reshape X using: `X.reshape(1,-1)[0]`.* |
||||
|
||||
1. Plot the data using matplotlib: |
||||
|
||||
![alt text][ex5q1] |
||||
|
||||
[ex5q1]: images/day1/ex5/w2_day1_ex5_q1.png "Scatter plot " |
||||
|
||||
As a reminder, fitting a Linear Regression on this data means finding (a,b) that fit the data points well:
||||
|
||||
- y_pred = a*x + b
||||
|
||||
Mathematically, it means finding (a,b) that minimizes the MSE, which is the loss used in Linear Regression. If we consider 3 data points: |
||||
|
||||
- Loss(a,b) = MSE(a,b) = 1/3 * ((y_pred1 - y_true1)**2 + (y_pred2 - y_true2)**2 + (y_pred3 - y_true3)**2)
||||
|
||||
and we know: |
||||
y_pred1 = a*x1 + b |
||||
y_pred2 = a*x2 + b |
||||
y_pred3 = a*x3 + b |
||||
|
||||
### Greedy approach |
||||
|
||||
2. Create a function `compute_mse`. Compute the MSE for `a = 1` and `b = 2`.
**Warning**: `X.shape` is `(100, 1)` and `y.shape` is `(100, )`. Make sure that `y_preds` and `y` have the same shape before computing `y_preds - y`. A possible completion is sketched after the skeleton below.
||||
|
||||
``` |
||||
def compute_mse(coefs, X, y): |
||||
''' |
||||
coefs is a list that contains a and b: [a,b] |
||||
X is the features set |
||||
y is the target |
||||
|
||||
Returns a float which is the MSE |
||||
''' |
||||
|
||||
#TODO |
||||
|
||||
y_preds = |
||||
mse = |
||||
|
||||
return mse |
||||
``` |
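One possible completion, as a sketch (it flattens X so that `y_preds` and `y` have the same shape):

```
import numpy as np

def compute_mse(coefs, X, y):
    a, b = coefs
    y_preds = a * X.reshape(1, -1)[0] + b   # flatten X to shape (100,)
    mse = ((y_preds - y) ** 2).mean()
    return mse
```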
||||
|
||||
|
||||
3. Create a grid of **640000** points that combines a and b with the following ranges, and check that the grid contains 640000 points:
||||
|
||||
- a between -200 and 200, step= 0.5 |
||||
- b between -200 and 200, step= 0.5 |
||||
|
||||
This is how to compute the grid with the combination of a and b: |
||||
|
||||
``` |
||||
aa, bb = np.mgrid[-200:200:0.5, -200:200:0.5] |
||||
grid = np.c_[aa.ravel(), bb.ravel()] |
||||
|
||||
``` |
||||
|
||||
4. Compute the MSE for all points in the grid. If possible, parallelize the computations. You may need `functools.partial` to parallelize a function that takes several parameters over a list (see the sketch below). Put the result in a variable named `losses`.
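One possible way to parallelize, as a sketch (it assumes `compute_mse` from question 2 and `grid` from question 3; in a script, run it under `if __name__ == "__main__":`):

```
from functools import partial
from multiprocessing import Pool

# freeze X and y so that the mapped function only takes the [a, b] pair
compute_mse_on_data = partial(compute_mse, X=X, y=y)

with Pool() as pool:  # one worker per CPU core by default
    losses = pool.map(compute_mse_on_data, grid)
```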
||||
|
||||
|
||||
5. Use this chunk of code to plot the MSE in 2D: |
||||
|
||||
``` |
||||
aa, bb = np.mgrid[-200:200:.5, -200:200:.5] |
||||
grid = np.c_[aa.ravel(), bb.ravel()] |
||||
losses_reshaped = np.array(losses).reshape(aa.shape) |
||||
|
||||
f, ax = plt.subplots(figsize=(8, 6)) |
||||
contour = ax.contourf(aa, |
||||
bb, |
||||
losses_reshaped, |
||||
100, |
||||
cmap="RdBu", |
||||
vmin=0, |
||||
vmax=160000) |
||||
ax_c = f.colorbar(contour) |
||||
ax_c.set_label("MSE") |
||||
|
||||
ax.set(aspect="equal", |
||||
xlim=(-200, 200), |
||||
ylim=(-200, 200), |
||||
xlabel="$a$", |
||||
ylabel="$b$") |
||||
``` |
||||
The expected output is: |
||||
|
||||
![alt text][ex5q5] |
||||
|
||||
[ex5q5]: images/day1/ex5/w2_day1_ex5_q5.png "MSE " |
||||
|
||||
|
||||
6. From the `losses` list, find the optimal values of a and b and plot the corresponding line on the scatter plot of question 1.
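A sketch of one way to do it (the expected optimum, from the correction below, is around a = 42.5 and b = 99):

```
import matplotlib.pyplot as plt
import numpy as np

best_idx = int(np.argmin(losses))
a_best, b_best = grid[best_idx]

plt.scatter(X, y, label="data")
plt.plot(X, a_best * X + b_best, color="red", label="best grid line")
plt.legend()
plt.show()
```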
||||
|
||||
|
||||
|
||||
In this example we computed the MSE 640000 times. It is frequent to deal with 50 features, which requires 51 parameters to fit the Linear Regression. With 400 candidate values per coefficient, trying this approach on 50 features would require computing **5.07e+132** MSEs. Even if we reduce the scope and try only 5 values per coefficient, we would have to compute the MSE **4.4409e+35** times. This approach is not scalable, and that is why it is not used to find the optimal coefficients of a Linear Regression.
||||
|
||||
### Gradient Descent |
||||
|
||||
In a nutshell, gradient descent is an optimization algorithm used to minimize some function by iteratively moving in the direction of steepest descent, as defined by the negative of the gradient. In machine learning, we use gradient descent to update the parameters (a and b) of our model. Parameters refer to the coefficients used in Linear Regression. Before starting to implement the questions, take the time to read this article: https://jairiidriss.medium.com/gradient-descent-algorithm-from-scratch-using-python-2b36c1548917. It explains gradient descent and how to implement it. The "tricky" part is the computation of the derivative of the MSE. You may take the formulas of the derivatives as given to implement the gradient descent (`d_theta_0` and `d_theta_1` in the article).
||||
|
||||
7. Implement the gradient descent to find the optimal a and b with `learning_rate = 0.1` and `nbr_iterations=100` (see the sketch after this list).
||||
8. Save a and b at each iteration in a two-dimensional numpy array. Add them to the plot of the previous part and observe how a and b converge towards the minimum. The plot should look like this:
||||
|
||||
![alt text][ex5q8] |
||||
|
||||
[ex5q8]: images/day1/ex5/w2_day1_ex5_q8.png "MSE + Gradient descent" |
||||
9. Use Linear Regression from Scikit-learn. Compare the results. |
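A sketch of the gradient descent of question 7, using the derivative formulas of the article (`d_theta_0` and `d_theta_1`); the starting point a = b = 0 is an assumption:

```
import numpy as np

def gradient_descent(X, y, learning_rate=0.1, nbr_iterations=100):
    x = X.reshape(1, -1)[0]                  # flatten to match y's shape
    a, b = 0.0, 0.0
    history = np.zeros((nbr_iterations, 2))  # a and b through the iterations
    n = len(y)
    for i in range(nbr_iterations):
        y_preds = a * x + b
        # derivatives of the MSE with respect to a and b
        d_a = (2 / n) * np.sum((y_preds - y) * x)
        d_b = (2 / n) * np.sum(y_preds - y)
        a -= learning_rate * d_a
        b -= learning_rate * d_b
        history[i] = [a, b]
    return a, b, history
```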
||||
|
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the outputted plot looks like: |
||||
|
||||
![alt text][ex5q1] |
||||
|
||||
[ex5q1]: images/day1/ex5/w2_day1_ex5_q1.png "Scatter plot " |
||||
|
||||
2. This question is validated if the output is: `11808.867339751561` |
||||
|
||||
3. This question is validated if `grid.shape` is `(640000,2)`. |
||||
|
||||
4. This question is validated if the 10 first values of losses are: |
||||
|
||||
``` |
||||
array([158315.41493175, 158001.96852692, 157689.02212209, 157376.57571726, |
||||
157064.62931244, 156753.18290761, 156442.23650278, 156131.79009795, |
||||
155821.84369312, 155512.39728829]) |
||||
``` |
||||
|
||||
5. This question is validated if the outputted plot looks like |
||||
|
||||
![alt text][ex5q5] |
||||
|
||||
[ex5q5]: images/day1/ex5/w2_day1_ex5_q5.png "MSE" |
||||
|
||||
6. This question is validated if the point returned is |
||||
`array([42.5, 99. ])`. It means that `a= 42.5` and `b=99`. |
||||
|
||||
7. This question is validated if the coefficients returned are |
||||
``` |
||||
Coefficients (a): 42.61943031121358 |
||||
Intercept (b): 99.18581814447936 |
||||
|
||||
``` |
||||
|
||||
8. This question is validated if the outputted plot is |
||||
|
||||
![alt text][ex5q8] |
||||
|
||||
[ex5q8]: images/day1/ex5/w2_day1_ex5_q8.png "MSE + Gradient descent" |
||||
|
||||
|
||||
9. This question is validated if the coefficients and intercept returned are: |
||||
|
||||
``` |
||||
Coefficients: [42.61943029] |
||||
Intercept: 99.18581817296929 |
||||
|
||||
``` |
|
||||
# W2D02 Piscine AI - Data Science |
||||
Classification |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
Classification |
||||
Today we will learn a different approach in Machine Learning: classification, which is a large domain in the field of statistics and machine learning. Generally, it can be broken down into two areas:
||||
|
||||
- **Binary classification**, where we wish to group an outcome into one of two groups. |
||||
- **Multi-class classification**, where we wish to group an outcome into one of multiple (more than two) groups. |
||||
|
||||
|
||||
You may wonder why the approach is different from regression, and why we don't simply use regression and define a threshold above which the class would be 1, and 0 otherwise - in binary classification.
The main reason is that linear regression is sensitive to outliers, hence the threshold would vary depending on the outliers in the data. The article mentioned below explains this reason with plots. To keep things simple, we can say that the output needed in classification is a probability of belonging to one of the classes. So, by definition, the value output by the classification model has to be between 0 and 1. Linear regression can't satisfy this constraint.
||||
|
||||
In mathematics, there are functions with nice properties that take as input a real number in (-inf, inf) and output a value between 0 and 1. The most popular of them is the **sigmoid**, which is the inverse of the logit function, hence the name logistic regression.
||||
|
||||
Let's take a small example to get a better understanding of the steps needed to perform a logistic regression on binary data. Let's assume that we want to predict the gender given a person's height.
||||
|
||||
Logistic regression steps: |
||||
|
||||
- Fit a sigmoid on the training data |
||||
- Compute sigmoid(height) = 0.7 (the sigmoid returns values between 0 and 1)
||||
- Return the class: 0.7 > 0.5 => class 1. Thus, the gender is male |
||||
|
||||
|
||||
More details: |
||||
https://towardsdatascience.com/understanding-logistic-regression-9b02c2aec102 |
||||
|
||||
|
||||
For the linear regression exercises, the loss (Mean Square Error - MSE) is minimized with an algorithm called **gradient descent**. In classification, the MSE loss can't be used because the output of the model is 0 or 1 (for binary classification).
The **logloss** or **cross entropy** is the loss used for classification. Similarly, it has some nice mathematical properties. The minimization of the **logloss** is not covered in the exercises. However, since it is used in most machine learning models for classification, I recommend spending some time reading the related article. This article gives a nice example of how it works:
||||
|
||||
https://towardsdatascience.com/cross-entropy-for-classification-d98e7f974451 |
||||
|
||||
|
||||
https://medium.com/swlh/what-is-logistic-regression-62807de62efa |
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources
||||
|
||||
|
||||
|
||||
# Exercise 1 Logistic regression in Scikit-learn
||||
|
||||
The goal of this exercise is to learn to use Scikit-learn to classify data.
||||
``` |
||||
X = [[0],[0.1],[0.2], [1],[1.1],[1.2], [1.3]] |
||||
y = [0,0,0,1,1,1,0] |
||||
|
||||
``` |
||||
|
||||
1. Fit a Logistic regression on X and y. |
||||
|
||||
2. Predict the class for `x_pred = [[0.5]]`. |
||||
|
||||
3. Predict the probabilities for `x_pred = [[0.5]]` using `predict_proba`. |
||||
|
||||
4. Print the coefficients (`coef_`), the intercept (`intercept_`) and the score of the logistic regression on X and y.
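A minimal sketch of the four questions (`random_state=0` matches the correction below):

```
from sklearn.linear_model import LogisticRegression

X = [[0],[0.1],[0.2], [1],[1.1],[1.2], [1.3]]
y = [0,0,0,1,1,1,0]

clf = LogisticRegression(random_state=0)
clf.fit(X, y)

x_pred = [[0.5]]
print(clf.predict(x_pred))          # predicted class
print(clf.predict_proba(x_pred))    # probabilities of each class
print(clf.coef_, clf.intercept_, clf.score(X, y))
```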
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the fitted logistic regression returns: |
||||
|
||||
``` |
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, |
||||
intercept_scaling=1, l1_ratio=None, max_iter=100, |
||||
multi_class='auto', n_jobs=None, penalty='l2', |
||||
random_state=0, solver='lbfgs', tol=0.0001, verbose=0, |
||||
warm_start=False) |
||||
``` |
||||
|
||||
2. This question is validated if the predicted class is `0`. |
||||
|
||||
3. This question is validated if the predicted probabilities are `[0.61450526 0.38549474]` |
||||
|
||||
4. This question is validated if the output is: |
||||
|
||||
``` |
||||
Coefficient: |
||||
[[0.81786797]] |
||||
Intercept: |
||||
[-0.87522391] |
||||
Score: |
||||
0.7142857142857143 |
||||
|
||||
``` |
||||
|
||||
# Exercise 2 Sigmoid
||||
|
||||
The goal of this exercise is to learn to compute and plot the sigmoid function.
||||
|
||||
1. On the same plot, plot the sigmoid function and the custom sigmoids defined as: |
||||
``` |
||||
- sigmoid1(x) = 1/(1+ exp(-(0.5*x + 3))) |
||||
- sigmoid2(x) = 1/(1+ exp(-(5*x + 11))) |
||||
``` |
||||
- Add a line representing the probability=0.5 |
||||
|
||||
The plot should look like this: |
||||
|
||||
![alt text][ex2q1] |
||||
|
||||
[ex2q1]: images/day2/ex2/w2_day2_ex2_q1.png "Scatter plot" |
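A possible matplotlib sketch (the x range is an assumption):

```
import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(-10, 10, 200)
plt.plot(x, 1 / (1 + np.exp(-x)), label="sigmoid")
plt.plot(x, 1 / (1 + np.exp(-(0.5 * x + 3))), label="sigmoid1")
plt.plot(x, 1 / (1 + np.exp(-(5 * x + 11))), label="sigmoid2")
plt.axhline(0.5, color="grey", linestyle="--", label="probability = 0.5")
plt.legend()
plt.show()
```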
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the plot looks like this:
||||
|
||||
|
||||
![alt text][ex2q1] |
||||
|
||||
[ex2q1]: images/day2/ex2/w2_day2_ex2_q1.png "Scatter plot" |
||||
|
||||
|
||||
|
||||
# Exercise 3 Decision boundary
||||
|
||||
The goal of this exercise is to learn to fit a logistic regression on simple examples and to understand how the algorithm separates the data of the different classes.
||||
|
||||
## 1 dimension |
||||
|
||||
First, we will start as usual with feature data in 1 dimension. Use `make_classification` from Scikit-learn to generate 100 data points:
||||
|
||||
``` |
||||
from sklearn.datasets import make_classification
X,y = make_classification(
||||
n_samples=100, |
||||
n_features=1, |
||||
n_informative=1, |
||||
n_redundant=0, |
||||
n_repeated=0, |
||||
n_classes=2, |
||||
n_clusters_per_class=1, |
||||
weights=[0.5,0.5], |
||||
flip_y=0.15, |
||||
class_sep=2.0, |
||||
hypercube=True, |
||||
shift=1.0, |
||||
scale=1.0, |
||||
shuffle=True, |
||||
random_state=88 |
||||
) |
||||
|
||||
``` |
||||
|
||||
*Warning: The shape of X is not the same as the shape of y. You may need (for some questions) to reshape X using: `X.reshape(1,-1)[0]`.* |
||||
|
||||
1. Plot the data using a scatter plot. The x-axis contains the feature and y-axis contains the target. |
||||
The plot should look like this: |
||||
|
||||
|
||||
![alt text][ex3q1] |
||||
|
||||
[ex3q1]: images/day2/ex3/w2_day2_ex3_q1.png "Scatter plot"
||||
|
||||
|
||||
|
||||
2. Fit a Logistic Regression on the generated data using scikit learn. Print the coefficients of the Logistic Regression. |
||||
3. Add to the previous plot the fitted sigmoid and the 0.5 probability line. The plot should look like this: |
||||
|
||||
|
||||
![alt text][ex3q3] |
||||
|
||||
[ex3q3]: images/day2/ex3/w2_day2_ex3_q3.png "Scatter plot + Logistic regression"
||||
|
||||
|
||||
4. Create a function `predict_probability` that takes as input the data point and the coefficients and that returns the predicted probability. As a reminder, the probability is given by: p(x) = 1/(1+ exp(-(coef*x + intercept))). Check you have the same results as the method `predict_proba` from Scikit-learn. |
||||
|
||||
``` |
||||
def predict_probability(coefs, X): |
||||
''' |
||||
coefs is a list that contains a and b: [coef, intercept] |
||||
X is the features set |
||||
|
||||
Returns probability of X |
||||
''' |
||||
#TODO |
||||
probabilities = |
||||
|
||||
return probabilities |
||||
``` |
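One possible completion, as a sketch (it flattens X so the formula is applied element-wise):

```
import numpy as np

def predict_probability(coefs, X):
    coef, intercept = coefs
    # p(x) = 1/(1 + exp(-(coef*x + intercept)))
    probabilities = 1 / (1 + np.exp(-(coef * X.reshape(1, -1)[0] + intercept)))
    return probabilities
```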
||||
|
||||
|
||||
|
||||
5. Create a function `predict_class` that takes as input the data point and the coefficients and that returns the predicted class. Check that you get the same results as the `predict` method of the classifier on the same data.
||||
|
||||
|
||||
6. On the plot, add the predicted class (the predicted class is shifted a bit to make the plot more readable, but obviously the predicted class is 0 or 1, not 0.1 or 0.9).
||||
The plot should look like this: |
||||
|
||||
|
||||
![alt text][ex3q6] |
||||
|
||||
[ex3q6]: images/day2/ex3/w2_day2_ex3_q5.png "Scatter plot + Logistic regression + predictions" |
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 2 dimensions |
||||
|
||||
Now, let us repeat this process on 2-dimensional data. The goal is to focus on the decision boundary and to understand how the Logistic Regression creates a line that separates the data. The code to plot the decision boundary is provided; however, it is important to understand how it works.
||||
|
||||
- Generate 250 data points using:
||||
|
||||
``` |
||||
X, y = make_classification(n_features=2, |
||||
n_redundant=0, |
||||
n_samples=250, |
||||
n_classes=2, |
||||
n_clusters_per_class=1, |
||||
flip_y=0.05, |
||||
class_sep=3, |
||||
random_state=43) |
||||
|
||||
``` |
||||
7. Fit the Logistic Regression on X and y and use the code below to plot the fitted sigmoid on the data set. |
||||
The plot should look like this: |
||||
|
||||
![alt text][ex3q7] |
||||
|
||||
[ex3q7]: images/day2/ex3/w2_day2_ex3_q6.png "Logistic regression decision boundary" |
||||
|
||||
|
||||
|
||||
``` |
||||
xx, yy = np.mgrid[-5:5:.01, -5:5:.01] |
||||
grid = np.c_[xx.ravel(), yy.ravel()] |
||||
#if needed change the line below |
||||
probs = clf.predict_proba(grid)[:, 1].reshape(xx.shape) |
||||
|
||||
f, ax = plt.subplots(figsize=(8, 6)) |
||||
contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu", |
||||
vmin=0, vmax=1) |
||||
ax_c = f.colorbar(contour) |
||||
ax_c.set_label("$P(y = 1)$") |
||||
ax_c.set_ticks([0, .25, .5, .75, 1]) |
||||
|
||||
ax.scatter(X[:,0], X[:, 1], c=y, s=50, |
||||
cmap="RdBu", vmin=-.2, vmax=1.2, |
||||
edgecolor="white", linewidth=1) |
||||
|
||||
ax.set(aspect="equal", |
||||
xlim=(-5, 5), ylim=(-5, 5), |
||||
xlabel="$X_1$", ylabel="$X_2$") |
||||
|
||||
``` |
||||
||||
|
||||
|
||||
|
||||
https://stackoverflow.com/questions/28256058/plotting-decision-boundary-of-logistic-regression |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the outputted plot looks like this: |
||||
|
||||
![alt text][ex3q1] |
||||
|
||||
[ex3q1]: images/day2/ex3/w2_day2_ex3_q1.png "Scatter plot" |
||||
|
||||
|
||||
|
||||
|
||||
2. This question is validated if the coefficient and the intercept of the Logistic Regression are: |
||||
|
||||
``` |
||||
Intercept: [-0.98385574] |
||||
Coefficient: [[1.18866075]] |
||||
``` |
||||
3. This question is validated if the plot looks like this: |
||||
|
||||
|
||||
![alt text][ex3q2] |
||||
|
||||
[ex3q2]: images/day2/ex3/w2_day2_ex3_q3.png "Scatter plot" |
||||
|
||||
|
||||
|
||||
4. This question is validated if `predict_probability` outputs the same probabilities as `predict_proba`. Note that the values have to match one of the class probabilities, not both. To do so, compare your output with: `clf.predict_proba(X)[:,1]`. The shape of the arrays is not important. |
||||
|
||||
5. This question is validated if `predict_class` outputs the same classes as `clf.predict(X)`. The shape of the arrays is not important.
||||
|
||||
6. This question is validated if the plot looks like this: |
||||
|
||||
![alt text][ex3q6] |
||||
|
||||
[ex3q6]: images/day2/ex3/w2_day2_ex3_q5.png "Scatter plot + Logistic regression + predictions" |
||||
|
||||
As mentioned, it is not required to shift the class prediction to make the plot easier to understand. |
||||
|
||||
7. This question is validated if the plot looks like this: |
||||
|
||||
![alt text][ex3q7] |
||||
|
||||
[ex3q7]: images/day2/ex3/w2_day2_ex3_q6.png "Logistic regression decision boundary" |
||||
|
||||
|
||||
|
||||
# Exercise 4: Train test split
||||
|
||||
The goal of this exercise is to learn to split a classification data set. The idea is the same as splitting a regression data set, but there's one important detail specific to classification: the proportion of each class in the train set and test set.
||||
|
||||
|
||||
|
||||
``` |
||||
X = np.arange(1,21).reshape(10,-1) |
||||
y = np.zeros(10) |
||||
y[7:] = 1 |
||||
``` |
||||
1. Split the data using `train_test_split` with `shuffle=False`. The test set represents 20% of the total size of the data set. Print X_train, y_train, X_test, y_test. Compute the proportion of class `1` on the train set and test set. |
||||
|
||||
2. Having a train set with different properties than the test set is not recommended. The analogy of the exam (https://www.youtube.com/watch?v=_vdMKioCXqQ) helps to understand this point: if the questions you get at the exam are completely different from what you prepared for, you are not evaluated on what you learnt. The training set has to be representative of the data set. Now, split the data in a train set and test set, but keep the proportion of class `1` nearly constant. In theory, the parameter `shuffle` could work since it relies on random sampling, but the proportions can still fluctuate. The parameter `stratify` will always split the data while keeping the same proportion of class `1` in the train set and test set. Using the parameter `stratify`, split the data below and print the proportion of class `1` in the train set and test set (a sketch is given after the code block).
||||
|
||||
``` |
||||
X = np.arange(1,201).reshape(100,-1) |
||||
y = np.zeros(100) |
||||
y[70:] = 1 |
||||
``` |
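A sketch of the stratified split (the `random_state` is illustrative; the question does not impose one):

```
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=21)

# y contains 0s and 1s, so the mean is the proportion of class 1
print(y_train.mean(), y_test.mean())
```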
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if X_train, y_train, X_test, y_test match this output: |
||||
|
||||
``` |
||||
X_train: |
||||
[[ 1 2] |
||||
[ 3 4] |
||||
[ 5 6] |
||||
[ 7 8] |
||||
[ 9 10] |
||||
[11 12] |
||||
[13 14] |
||||
[15 16]] |
||||
|
||||
|
||||
y_train: |
||||
[0. 0. 0. 0. 0. 0. 0. 1.] |
||||
|
||||
|
||||
X_test: |
||||
[[17 18] |
||||
[19 20]] |
||||
|
||||
|
||||
y_test: |
||||
[1. 1.] |
||||
``` |
||||
The proportion of class `1` is **0.125** in the train set and **1.** in the test set. |
||||
|
||||
2. This question is validated if the proportion of class `1` is **0.3** for both sets. |
||||
|
||||
|
||||
# Exercise 5 Breast Cancer prediction
||||
|
||||
The goal of this exercise is to use Logistic Regression to predict breast cancer. It is always important to understand the data before training any Machine Learning algorithm. The data is described in **breast-cancer-wisconsin.names**. I suggest adding the column names manually to the DataFrame.
||||
|
||||
Preliminary: |
||||
- If needed, replace missing values with the median of the column. |
||||
- Handle the column `Sample code number`. This column won't be used to train the model as it doesn't contain information on breast cancer. There are two solutions: drop it or set it as index. |
||||
|
||||
1. Print the proportion of class `Benign`. What would be the accuracy if the model always predicted `Benign`?
Later this week we will learn about other metrics, such as AUC, that will help us tackle highly imbalanced data sets.
||||
|
||||
2. Using train_test_split, split the data set in a train set and test set (20%). Both sets should have approximately the same proportion of class `Benign`. Use `random_state = 43`.
||||
3. Fit the logistic regression on the train set. Predict on the train set and test set. Compute the score on the train set and test set. 92-97% accuracy is expected on the test set.
4. Compute the confusion matrix on both the train set and the test set. Analyse the number of false negatives and false positives. A sketch of the whole workflow is given below.
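A sketch of the whole workflow; the file path is an assumption and the column names are taken from **breast-cancer-wisconsin.names** (class `Benign` is encoded as `2` in this data set):

```
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
         'Uniformity of Cell Shape', 'Marginal Adhesion',
         'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
         'Normal Nucleoli', 'Mitoses', 'Class']
df = pd.read_csv('breast-cancer-wisconsin.data', names=names, na_values='?')
df = df.fillna(df.median()).set_index('Sample code number')

X, y = df.drop(columns='Class'), df['Class']
print((y == 2).mean())  # proportion of class Benign

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=43)

clf = LogisticRegression(max_iter=10000)  # high max_iter to avoid convergence warnings
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train), clf.score(X_test, y_test))
print(confusion_matrix(y_train, clf.predict(X_train)))
print(confusion_matrix(y_test, clf.predict(X_test)))
```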
||||
|
||||
|
||||
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html |
||||
|
||||
https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the proportion of class `Benign` is 0.6552217453505007. It means that if you always predict `Benign` your accuracy would be 66%. |
||||
|
||||
2. This question is validated if the proportion of one of the classes is approximately the same on the train and test set: ~0.65. In my case:
||||
|
||||
- test: 0.6571428571428571 |
||||
- train: 0.6547406082289803 |
||||
|
||||
|
||||
3. This question is validated if the output is: |
||||
|
||||
|
||||
``` |
||||
# Train |
||||
Class prediction on train set: |
||||
[4 2 4 2 2 2 2 4 2 2] |
||||
|
||||
Probability prediction on train set: |
||||
[0.99600415 0.00908666 0.99992744 0.00528803 0.02097154 0.00582772 |
||||
0.03565076 0.99515326 0.00788281 0.01065484] |
||||
|
||||
Score on train set: |
||||
0.9695885509838998 |
||||
|
||||
#Test |
||||
|
||||
Class prediction on test set: |
||||
[2 2 2 4 2 4 2 2 2 4] |
||||
|
||||
Probability prediction on test set: |
||||
[0.01747203 0.22495309 0.00698756 0.54020801 0.0015289 0.99862249 |
||||
0.33607994 0.01227679 0.00438157 0.99972344] |
||||
|
||||
Score on test set: |
||||
0.9642857142857143 |
||||
|
||||
``` |
||||
Only the 10 first predictions are outputted. The score is computed on all the data in each set.
||||
|
||||
For various reasons, you may get a different data split than mine. The requirement for this question is a score on the test set above 92%.
||||
|
||||
If the score is 1, congratulations, you've leaked your first target. Drop the target from X_train or X_test ;) !
||||
|
||||
4. This question is validated if the confusion matrix on the train set is similar to: |
||||
``` |
||||
array([[357, 9], |
||||
[ 8, 185]]) |
||||
``` |
||||
and if the confusion matrix on the test set is similar to: |
||||
|
||||
``` |
||||
array([[90, 2], |
||||
[ 3, 45]]) |
||||
``` |
||||
As said, you may have slightly different results because of the data splitting. However, the values in your confusion matrices should be close to these results.
||||
|
||||
|
||||
# Exercise 6 Multi-class (Optional)
||||
|
||||
The goal of this exercise is to learn to train a classification algorithm on multi-class labelled data.
Some algorithms, such as SVM or Logistic Regression, do not natively support multi-class problems (more than 2 classes). There are approaches that allow these algorithms to be used on multi-class data.
||||
Let's assume we work with 3 classes: A, B and C. |
||||
|
||||
- One-vs-Rest considers 3 binary classification problems: A vs B,C; B vs A,C and C vs A,B. If there are 10 classes, 10 binary classification problems would be fitted. |
||||
- One-vs-One considers 3 binary classification problems: A vs B, A vs C, B vs C. If there are 10 classes, 45 binary classification problems would be fitted. Given the volume of data, this technique may not be scalable.
||||
|
||||
More details: |
||||
|
||||
https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/ |
||||
|
||||
Let's implement the One-vs-Rest approach with `LogisticRegression`.
||||
|
||||
Preliminary: |
||||
- Import the Iris data set from Scikit-learn
||||
|
||||
``` |
||||
import pandas as pd
from sklearn.datasets import load_iris
||||
iris = load_iris() |
||||
|
||||
X = pd.DataFrame(data=iris['data'], columns=iris.feature_names) |
||||
y = pd.DataFrame(data=iris['target'], columns=['target']) |
||||
``` |
||||
- Using train_test_split, split the data set in a train set and test set (20%) with `shuffle=True` and `random_state=43`. |
||||
|
||||
|
||||
1. Create a function that takes as input the data and returns three **trained** classifiers.
||||
- `clf0` takes as input a binary data set where class 1 is the original class `0` and class 0 is the original classes `1` and `2`.
- `clf1` takes as input a binary data set where class 1 is the original class `1` and class 0 is the original classes `0` and `2`.
- `clf2` takes as input a binary data set where class 1 is the original class `2` and class 0 is the original classes `0` and `1`.
||||
|
||||
``` |
||||
def train(X_train,y_train): |
||||
#TODO |
||||
return clf0, clf1, clf2 |
||||
|
||||
``` |
||||
2. Create a function that takes as input the trained classifiers and the feature set, and that returns the predicted class. Use `predict_one_vs_all` to output the predicted classes on the test set. Compare the results with the Logistic Regression algorithm from scikit-learn used in one-vs-rest mode. The results may differ because the solver may not converge. Later this week, we will learn to preprocess the data to avoid convergence issues.
||||
|
||||
- `clf0` outputs the probability of its positive class, i.e. the original class `0`.
- `clf1` outputs the probability of its positive class, i.e. the original class `1`.
- `clf2` outputs the probability of its positive class, i.e. the original class `2`.
||||
|
||||
The predicted class is the one that gets the **highest probability** among the three models. |
||||
|
||||
```
def predict_one_vs_all(X, clf0, clf1, clf2):
    #TODO
    return classes
```
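One possible completion, as a sketch:

```
import numpy as np

def predict_one_vs_all(X, clf0, clf1, clf2):
    # one column per classifier: P(class 0), P(class 1), P(class 2)
    probs = np.column_stack([clf.predict_proba(X)[:, 1]
                             for clf in (clf0, clf1, clf2)])
    classes = probs.argmax(axis=1)  # class with the highest probability
    return classes
```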
||||
|
||||
|
||||
|
||||
|
||||
https://randerson112358.medium.com/python-logistic-regression-program-5e1b32f964db |
||||
|
||||
https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if each classifier takes as input binary data, as below:
||||
|
||||
``` |
||||
def train(X_train, y_train): |
||||
clf = LogisticRegression() |
||||
clf1 = LogisticRegression() |
||||
clf2 = LogisticRegression() |
||||
|
||||
clf.fit(X_train, y_train == 0) |
||||
clf1.fit(X_train, y_train == 1) |
||||
clf2.fit(X_train, y_train == 2) |
||||
|
||||
return clf, clf1, clf2 |
||||
``` |
||||
|
||||
2. This question is validated if the predicted classes on the test set are: |
||||
|
||||
``` |
||||
array([0, 0, 2, 1, 2, 0, 2, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, |
||||
0, 2, 2, 2, 0, 1, 0, 0]) |
||||
|
||||
``` |
||||
|
||||
Even though I had the warning `ConvergenceWarning: lbfgs failed to converge (status=1):`, I noticed that `LogisticRegression` returns the same output.
||||
|
||||
|
|
||||
# D04 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
Today we will learn how to choose the right Machine Learning metric depending on the problem you are solving, and how to compute it. A metric gives an idea of how well the model performs. Depending on whether you are working on a classification or a regression problem, the metrics considered are different. It is important to understand that all metrics are just metrics, not the truth.
||||
|
||||
We will focus on the most important metrics: |
||||
|
||||
- Regression: |
||||
- **R2**, **Mean Square Error**, **Mean Absolute Error** |
||||
- Classification: |
||||
- **F1 score**, **accuracy**, **precision**, **recall** and **AUC scores**. Even if it is not considered a metric, the **confusion matrix** is always useful to understand the model performance.
||||
|
||||
Warning: **Imbalanced data set** |
||||
|
||||
Let us assume we are predicting a rare event that occurs less than 2% of the time. Having a model that scores a good accuracy is easy: it doesn't have to be "smart", all it has to do is always predict the majority class. Depending on the problem, this can be disastrous. For example, working with real-life data, breast cancer prediction is an imbalanced problem where predicting the majority class leads to disastrous consequences. That is why metrics such as AUC are useful.
||||
https://stats.stackexchange.com/questions/260164/auc-and-class-imbalance-in-training-test-dataset |
||||
|
||||
|
||||
Before computing the metrics, read this article carefully to understand the role of these metrics.
||||
https://www.kdnuggets.com/2018/06/right-metric-evaluating-machine-learning-models-2.html |
||||
|
||||
||||
|
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources
||||
https://scikit-learn.org/stable/modules/model_evaluation.html |
||||
|
||||
|
||||
# Exercise 1 MSE Scikit-learn
||||
|
||||
The goal of this exercise is to learn to use `sklearn.metrics` to compute the mean squared error (MSE).
||||
|
||||
1. Compute the MSE using `sklearn.metrics` on `y_true` and `y_pred` below: |
||||
|
||||
``` |
||||
y_true = [91, 51, 2.5, 2, -5] |
||||
y_pred = [90, 48, 2, 2, -4] |
||||
``` |
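A minimal sketch of the expected call:

```
from sklearn.metrics import mean_squared_error

y_true = [91, 51, 2.5, 2, -5]
y_pred = [90, 48, 2, 2, -4]
print(mean_squared_error(y_true, y_pred))  # 2.25
```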
||||
## Correction |
||||
|
||||
1. This question is validated if the MSE outputted is **2.25**. |
||||
|
||||
|
||||
# Exercise 2 Accuracy Scikit-learn
||||
|
||||
|
||||
The goal of this exercise is to learn to use `sklearn.metrics` to compute the accuracy.
||||
|
||||
1. Compute the accuracy using `sklearn.metrics` on `y_true` and `y_pred` below: |
||||
|
||||
``` |
||||
y_pred = [0, 1, 0, 1, 0, 1, 0] |
||||
y_true = [0, 0, 1, 1, 1, 1, 0] |
||||
``` |
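A minimal sketch of the expected call:

```
from sklearn.metrics import accuracy_score

y_pred = [0, 1, 0, 1, 0, 1, 0]
y_true = [0, 0, 1, 1, 1, 1, 0]
print(accuracy_score(y_true, y_pred))  # 4 correct out of 7
```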
||||
## Correction |
||||
|
||||
1. This question is validated if the accuracy outputted is **0.5714285714285714**. |
||||
|
||||
|
||||
|
||||
# Exercise 3 Regression
||||
|
||||
The goal of this exercise is to learn to evaluate a machine learning model using many regression metrics.
||||
|
||||
Preliminary: |
||||
|
||||
- Import the California Housing data set and split it in a train set and a test set (10%). Fit a linear regression on the data set. *The goal is to focus on the metrics; that is why the code to fit the Linear Regression is given.*
||||
|
||||
|
||||
|
||||
``` |
||||
#imports |
||||
from sklearn.datasets import fetch_california_housing |
||||
from sklearn.model_selection import train_test_split |
||||
from sklearn.linear_model import LinearRegression |
||||
from sklearn.preprocessing import StandardScaler |
||||
from sklearn.impute import SimpleImputer |
||||
from sklearn.pipeline import Pipeline |
||||
#data |
||||
housing = fetch_california_housing() |
||||
X, y = housing['data'], housing['target'] |
||||
#split data train test |
||||
X_train, X_test, y_train, y_test = train_test_split(X, |
||||
y, |
||||
test_size=0.1, |
||||
shuffle=True, |
||||
random_state=13) |
||||
#pipeline |
||||
pipeline = [('imputer', SimpleImputer(strategy='median')), |
||||
('scaler', StandardScaler()), |
||||
('lr', LinearRegression())] |
||||
pipe = Pipeline(pipeline) |
||||
#fit |
||||
pipe.fit(X_train, y_train) |
||||
|
||||
``` |
||||
|
||||
1. Predict on the train set and test set |
||||
|
||||
2. Compute R2, Mean Squared Error and Mean Absolute Error on both the train set and the test set.
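A sketch of the two questions, reusing `pipe` and the splits from the preliminary code:

```
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

for name, y_true_set, y_pred_set in [("train", y_train, y_train_pred),
                                     ("test", y_test, y_test_pred)]:
    print(f"r2 on the {name} set:", r2_score(y_true_set, y_pred_set))
    print(f"MAE on the {name} set:", mean_absolute_error(y_true_set, y_pred_set))
    print(f"MSE on the {name} set:", mean_squared_error(y_true_set, y_pred_set))
```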
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the predictions on the train set and test set are: |
||||
``` |
||||
# 10 first values Train |
||||
array([1.54505951, 2.21338527, 2.2636205 , 3.3258957 , 1.51710076, |
||||
1.63209319, 2.9265211 , 0.78080924, 1.21968217, 0.72656239]) |
||||
|
||||
``` |
||||
|
||||
``` |
||||
#10 first values Test |
||||
|
||||
array([ 1.82212706, 1.98357668, 0.80547979, -0.19259114, 1.76072418, |
||||
3.27855815, 2.12056804, 1.96099917, 2.38239663, 1.21005304]) |
||||
``` |
||||
|
||||
2. This question is validated if the results match this output: |
||||
|
||||
``` |
||||
r2 on the train set: 0.3552292936915783 |
||||
MAE on the train set: 0.5300159371615256 |
||||
MSE on the train set: 0.5210784446797679 |
||||
|
||||
r2 on the test set: 0.30265471284464673 |
||||
MAE on the test set: 0.5454023699809112 |
||||
MSE on the test set: 0.5537420654727396 |
||||
|
||||
``` |
||||
This result shows that the model has slightly better results on the train set than on the test set. That's frequent, since it is easier to get a better grade on an exam we studied for than on an exam that is different from what we prepared. However, the results are not good: r2 ~ 0.3. Fitting non-linear models such as the Random Forest on this data may improve the results. That's the goal of exercise 5.
||||
|
||||
|
||||
|
||||
# Exercise 4 Classification
||||
|
||||
The goal of this exercise is to learn to evaluate a machine learning model using many classification metrics.
||||
|
||||
Preliminary: |
||||
|
||||
|
||||
- Import the Breast Cancer data set and split it in a train set and a test set (20%). Fit a logistic regression on the data set. *The goal is to focus on the metrics; that is why the code to fit the Logistic Regression is given.*
||||
|
||||
|
||||
|
||||
``` |
||||
from sklearn.linear_model import LogisticRegression |
||||
from sklearn.datasets import load_breast_cancer |
||||
from sklearn.model_selection import train_test_split |
||||
from sklearn.preprocessing import StandardScaler |
||||
|
||||
X , y = load_breast_cancer(return_X_y=True) |
||||
X_train, X_test, y_train, y_test = train_test_split( |
||||
X, y, test_size=0.20) |
||||
scaler = StandardScaler() |
||||
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # scale the test set with the statistics fitted on the train set
||||
classifier = LogisticRegression() |
||||
classifier.fit(X_train_scaled, y_train) |
||||
|
||||
``` |
||||
|
||||
1. Predict on the train set and test set |
||||
|
||||
2. Compute F1, accuracy, precision, recall, roc_auc scores on the train set and test set. Print the confusion matrix on the test set results. |
||||
|
||||
**Note: AUC can only be computed on probabilities, not on classes.** |
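A sketch for the test set, reusing the preliminary code (the train set works the same way with `X_train_scaled`):

```
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score, roc_auc_score)

y_test_pred = classifier.predict(X_test_scaled)
y_test_proba = classifier.predict_proba(X_test_scaled)[:, 1]

print("F1:", f1_score(y_test, y_test_pred))
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("ROC_AUC:", roc_auc_score(y_test, y_test_proba))  # needs probabilities
print(confusion_matrix(y_test, y_test_pred))
```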
||||
|
||||
3. Plot the ROC curve on the test set using `roc_curve` from scikit-learn. There are many ways to create this plot. It should look like this:
||||
|
||||
![alt text][logo_ex4] |
||||
|
||||
[logo_ex4]: images/day4/ex4/w2_day4_ex4_q3.png "ROC AUC " |
||||
|
||||
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_roc_curve.html |
||||
## Correction |
||||
|
||||
1. This question is validated if the predictions on the train set and test set are: |
||||
|
||||
``` |
||||
# 10 first values Train |
||||
array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0]) |
||||
|
||||
# 10 first values Test |
||||
array([1, 1, 0, 0, 0, 1, 1, 1, 0, 0]) |
||||
``` |
||||
|
||||
2. This question is validated if the results match this output: |
||||
|
||||
``` |
||||
|
||||
F1 on the train set: 0.9911504424778761 |
||||
Accuracy on the train set: 0.989010989010989 |
||||
Recall on the train set: 0.9893992932862191 |
||||
Precision on the train set: 0.9929078014184397 |
||||
ROC_AUC on the train set: 0.9990161111794368 |
||||
|
||||
|
||||
F1 on the test set: 0.9801324503311258 |
||||
Accuracy on the test set: 0.9736842105263158 |
||||
Recall on the test set: 0.9736842105263158 |
||||
Precision on the test set: 0.9866666666666667 |
||||
ROC_AUC on the test set: 0.9863247863247864 |
||||
|
||||
``` |
||||
|
||||
The confusion matrix on the test set should be: |
||||
|
||||
``` |
||||
array([[37, 2], |
||||
[ 1, 74]]) |
||||
|
||||
``` |
||||
|
||||
3. The ROC AUC plot should look like: |
||||
|
||||
![alt text][logo_ex4] |
||||
|
||||
[logo_ex4]: images/day4/ex4/w2_day4_ex4_q3.png "ROC AUC " |
||||
|
||||
Having a 99% ROC AUC is not usual. The data set we used is easy to classify. On real data sets, if you get such a high ROC AUC score, always check for leakage.
||||
|
||||
|
||||
|
||||
# Exercise 5 Machine Learning models
||||
|
||||
The goal of this exercise is to get an overview of the existing Machine Learning models and to learn to call them from scikit-learn.
||||
We will focus on: |
||||
|
||||
- SVM/ SVC |
||||
- Decision Tree |
||||
- Random Forest (Ensemble learning) |
||||
- Gradient Boosting (Ensemble learning, Boosting techniques) |
||||
|
||||
All these algorithms exist in two versions: regression and classification. Even if the logic is similar in both classification and regression, the loss function is specific to each case. |
||||
|
||||
It is really easy to get lost among all the existing algorithms. This article is very useful to get a clear overview of the models and to understand which algorithm to use and when. https://towardsdatascience.com/how-to-choose-the-right-machine-learning-algorithm-for-your-application-1e36c32400b9
||||
|
||||
|
||||
Preliminary: |
||||
|
||||
- Import California Housing data set and split it in a train set and a test set (10%). Fit a linear regression on the data set. *The goal is to focus on the metrics, that is why the code to fit the Linear Regression is given.* |
||||
|
||||
|
||||
|
||||
``` |
||||
#imports |
||||
from sklearn.datasets import fetch_california_housing |
||||
from sklearn.model_selection import train_test_split |
||||
from sklearn.linear_model import LinearRegression |
||||
from sklearn.preprocessing import StandardScaler |
||||
from sklearn.impute import SimpleImputer |
||||
from sklearn.pipeline import Pipeline |
||||
#data |
||||
housing = fetch_california_housing() |
||||
X, y = housing['data'], housing['target'] |
||||
#split data train test |
||||
X_train, X_test, y_train, y_test = train_test_split(X, |
||||
y, |
||||
test_size=0.1, |
||||
shuffle=True, |
||||
random_state=43) |
||||
#pipeline |
||||
pipeline = [('imputer', SimpleImputer(strategy='median')), |
||||
('scaler', StandardScaler()), |
||||
('lr', LinearRegression())] |
||||
pipe = Pipeline(pipeline) |
||||
#fit |
||||
pipe.fit(X_train, y_train) |
||||
|
||||
``` |
||||
|
||||
1. Create 5 pipelines with 5 different models as final estimator (keep the imputer and scaler unchanged): |
||||
1. Linear Regression |
||||
2. SVM |
||||
3. Decision Tree (set `random_state=43`) |
||||
4. Random Forest (set `random_state=43`) |
||||
5. Gradient Boosting (set `random_state=43`) |
||||
|
||||
Take time to get a basic understanding of the role of the main hyperparameters and their default values.
- For each algorithm, print the R2, MSE and MAE on both the train set and the test set. A sketch of the pipeline loop is given below.
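A sketch of the pipeline loop, reusing the imports and splits of the preliminary code:

```
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

models = {'Linear Regression': LinearRegression(),
          'SVM': SVR(),
          'Decision Tree': DecisionTreeRegressor(random_state=43),
          'Random Forest': RandomForestRegressor(random_state=43),
          'Gradient Boosting': GradientBoostingRegressor(random_state=43)}

for name, model in models.items():
    pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler()),
                     ('model', model)])
    pipe.fit(X_train, y_train)
    # score() returns the R2; MSE and MAE can be computed as in exercise 3
    print(name, pipe.score(X_train, y_train), pipe.score(X_test, y_test))
```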
||||
|
||||
## Correction |
||||
|
||||
1. Some of the algorithms use random steps (e.g. the random sampling used by the `RandomForest`). I used `random_state = 43` for the Random Forest, the Decision Tree and the Gradient Boosting. This question is validated if the scores you got are close to:
||||
|
||||
``` |
||||
# Linear regression |
||||
|
||||
TRAIN |
||||
r2 on the train set: 0.34823544284172625 |
||||
MAE on the train set: 0.533092001261455 |
||||
MSE on the train set: 0.5273648371379568 |
||||
|
||||
TEST |
||||
r2 on the test set: 0.3551785428138914 |
||||
MAE on the test set: 0.5196420310323713 |
||||
MSE on the test set: 0.49761195027083804 |
||||
|
||||
|
||||
# SVM |
||||
|
||||
TRAIN |
||||
r2 on the train set: 0.6462366150965996 |
||||
MAE on the train set: 0.38356451633259875 |
||||
MSE on the train set: 0.33464478671339165 |
||||
|
||||
TEST |
||||
r2 on the test set: 0.6162644671183826 |
||||
MAE on the test set: 0.3897680598426786 |
||||
MSE on the test set: 0.3477101776543003 |
||||
|
||||
|
||||
# Decision Tree |
||||
|
||||
TRAIN |
||||
r2 on the train set: 0.9999999999999488 |
||||
MAE on the train set: 1.3685733933909677e-08 |
||||
MSE on the train set: 6.842866883530944e-14 |
||||
|
||||
TEST |
||||
r2 on the test set: 0.6263651902480918 |
||||
MAE on the test set: 0.4383758696244002 |
||||
MSE on the test set: 0.4727017198871596 |
||||
|
||||
|
||||
# Random Forest |
||||
|
||||
TRAIN |
||||
r2 on the train set: 0.9705418471542886 |
||||
MAE on the train set: 0.11983836612191189 |
||||
MSE on the train set: 0.034538356420577995 |
||||
|
||||
TEST |
||||
r2 on the test set: 0.7504673649554309 |
||||
MAE on the test set: 0.31889891600404635 |
||||
MSE on the test set: 0.24096164834441108 |
||||
|
||||
|
||||
# Gradient Boosting |
||||
|
||||
TRAIN |
||||
r2 on the train set: 0.7395782392433273 |
||||
MAE on the train set: 0.35656543036682264 |
||||
MSE on the train set: 0.26167490389525294 |
||||
|
||||
TEST |
||||
r2 on the test set: 0.7157456298013534 |
||||
MAE on the test set: 0.36455447680396397 |
||||
MSE on the test set: 0.27058170064218096 |
||||
|
||||
``` |
||||
It is important to notice that the Decision Tree overfits very easily. It learns the training data easily but is not able to generalize to the test set. This algorithm is not used much on its own.
However, Random Forest and Gradient Boosting propose a solid approach to correct the overfitting (in this case the parameter `max_depth` is set to None, which is why the Random Forest overfits the data). These two algorithms are used intensively in Machine Learning projects.
||||
|
||||
# Exercise 6 Grid Search
||||
|
||||
The goal of this exercise is to learn how to make an exhaustive search over specified parameter values for an estimator. This is very useful because the hyperparameters, which are the parameters of the model, impact its performance.
The scikit-learn object that runs the Grid Search is called GridSearchCV. We will learn about cross validation tomorrow. For now, let us set the parameter **cv** to `[(np.arange(18576), np.arange(18576,20640))]`.
This means that GridSearchCV splits the data set into a train set (the first 18576 rows) and a test set (the remaining 2064 rows).
||||
|
||||
Preliminary: |
||||
|
||||
- Load the California Housing data set. As mentioned, this time there's no need to split the data set into a train set and a test set since GridSearchCV does it.
||||
|
||||
You will have to run a Grid Search on the Random Forest with at least the hyperparameters mentioned below. It doesn't mean these are the only hyperparameters of the model. If possible, try at least 3 different values for each hyperparameter.
||||
|
||||
1. Run a Grid Search with `n_jobs` set to `-1` to parallelize the computations on all CPUs. The hyperparameters to change are: n_estimators, max_depth, min_samples_leaf. It may take some time to run.
||||
|
||||
|
||||
|
||||
Now, let us analyse the grid search's results in order to select the best model. |
||||
|
||||
2. Write a function that takes as input the Grid Search object and that returns the best model **fitted**, the best set of hyperparameters and the associated score:
||||
|
||||
|
||||
``` |
||||
def select_model_verbose(gs): |
||||
|
||||
return trained_model, best_params, best_score |
||||
``` |
||||
3. Use the trained model to predict on a new point: |
||||
|
||||
``` |
||||
new_point = np.array([[3.2031, 52., 5.47761194, 1.07960199, 910., 2.26368159, 37.85, -122.26]]) |
||||
``` |
||||
|
||||
|
||||
How do we know the best model returned by GridSearchCV is good enough and stable? That is what we will learn tomorrow!
||||
|
||||
WARNING: |
||||
|
||||
Some combinations of hyperparameters are not possible. For example, with the SVM, the linear kernel has no `gamma` parameter.
||||
|
||||
|
||||
Note: |
||||
|
||||
- GridSearchCV can also take a Pipeline instead of a Machine Learning model. It is useful to combine some Imputers or Dimension reduction techniques with some Machine Learning models in the same Pipeline. |
||||
- It may be useful to check on Kaggle if some Kagglers share their Grid Searches. |
||||
|
||||
|
||||
Resources:
||||
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html |
||||
|
||||
https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search |
||||
|
||||
|
||||
https://medium.com/fintechexplained/what-is-grid-search-c01fe886ef0a |
||||
|
||||
https://elutins.medium.com/grid-searching-in-machine-learning-quick-explanation-and-python-implementation-550552200596 |
||||
|
||||
https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the code that runs the gridsearch is (the parameters may change): |
||||
|
||||
``` |
||||
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':[10, 50, 75],
||||
'max_depth':[3,5,7], |
||||
'min_samples_leaf': [10,20,30]} |
||||
|
||||
rf = RandomForestRegressor() |
||||
gridsearch = GridSearchCV(rf, |
||||
parameters, |
||||
cv = [(np.arange(18576), np.arange(18576,20640))], |
||||
n_jobs=-1) |
||||
gridsearch.fit(X, y) |
||||
|
||||
|
||||
``` |
||||
|
||||
2. This question is validated if the function is: |
||||
|
||||
|
||||
``` |
||||
def select_model_verbose(gs): |
||||
|
||||
return gs.best_estimator_, gs.best_params_, gs.best_score_ |
||||
``` |
||||
In my case, the grid search results are not that interesting: even though it reduced the overfitting of the Random Forest, the score on the test set is lower than the test score returned by the Gradient Boosting in the previous exercise without any hyperparameter search.
||||
|
||||
|
||||
3. This question is validated if the code used is: |
||||
|
||||
|
||||
``` |
||||
model, best_params, best_score = select_model_verbose(gridsearch) |
||||
model.predict(new_point) |
||||
|
||||
``` |
|
||||
# D02 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
|
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources
||||
|
||||
|
||||
# Exercise 1
||||
|
||||
|
||||
|
||||
# Exercise 2
||||
|
||||
|
||||
|
||||
# Exercise 3
||||
|
||||
|
||||
|
||||
# Exercise 4
||||
|
||||
|
||||
|
||||
# Exercise 5
||||
|
|
||||
# Forest Cover Type Prediction
||||
|
||||
The goal of this project is to use cartographic variables to classify forest categories. You will have to analyse the data, create features, and train a machine learning model on the cartographic data to make it as accurate as possible.
||||
|
||||
|
||||
## Data |
||||
|
||||
The input files are `train.csv`, `test.csv` and `covtype.info`:
||||
- `train.csv` |
||||
- `test.csv` |
||||
- `covtype.info` |
||||
The train data set is used to **analyse the data and calibrate the models**. The goal is to get the accuracy as high as possible on the test set. The test set will only be made available at the end of the last day, to prevent overfitting of the test set.
||||
|
||||
The data is described in `covtype.info`. |
||||
|
||||
## Structure
||||
|
||||
|
||||
The structure of the project is: |
||||
|
||||
``` |
||||
project |
||||
│ README.md |
||||
│ environment.yml |
||||
│ |
||||
└───data |
||||
│ │ train.csv |
||||
│ | test.csv (not available first day) |
||||
| | covtype.info |
||||
│ |
||||
└───notebook |
||||
│ │ EDA.ipynb |
||||
| |
||||
|───scripts |
||||
| │ preprocessing_feature_engineering.py |
||||
| │ model_selection.py |
||||
│ | predict.py |
||||
│ |
||||
└───results |
||||
│ plots |
||||
│ test_predictions.csv |
||||
│ best_model.pkl |
||||
|
||||
``` |
||||
|
||||
|
||||
|
||||
## 1. EDA and feature engineering: |
||||
|
||||
- Create a Jupyter Notebook to analyse the data sets and perform EDA (Exploratory Data Analysis). This notebook won't be evaluated. |
||||
|
||||
- *Hint: Examples of interesting features (a pandas sketch is given after this list)*
||||
|
||||
- Distance to hydrology = sqrt((Horizontal_Distance_To_Hydrology)^2 + (Vertical_Distance_To_Hydrology)^2) |
||||
- Horizontal_Distance_To_Fire_Points - Horizontal_Distance_To_Roadways |
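A pandas sketch of these two features, assuming the columns of `train.csv` follow the names given in `covtype.info`:

```
import numpy as np
import pandas as pd

df = pd.read_csv('data/train.csv')  # path follows the project structure above

df['Distance_To_Hydrology'] = np.sqrt(
    df['Horizontal_Distance_To_Hydrology'] ** 2
    + df['Vertical_Distance_To_Hydrology'] ** 2)
df['Fire_Minus_Roadways'] = (df['Horizontal_Distance_To_Fire_Points']
                             - df['Horizontal_Distance_To_Roadways'])
```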
||||
|
||||
|
||||
## 2. Model Selection |
||||
|
||||
The model selection approach is a key step because it should return the best model and guarantee that the results are reproducible on the final test set. The goal of this step is to make sure that the results on the test set are not due to test-set overfitting. It implies splitting the data set as shown below:
||||
|
||||
|
||||
|
||||
``` |
||||
DATA |
||||
└───TRAIN FILE (0) |
||||
│ └───── Train (1) |
||||
│ | Fold0: |
||||
| | Train |
||||
| | Validation |
||||
| | Fold1: |
||||
| | Train |
||||
| | Validation |
||||
... ... ... |
||||
| | |
||||
| └───── Test (1) |
||||
│ |
||||
└─── TEST FILE (0) (available last day) |
||||
|
||||
``` |
||||
|
||||
**Rules:** |
||||
- Split train test |
||||
- Cross validation: at least 5 folds |
||||
- Grid search on at least 5 different models: |
||||
- Gradient Boosting, KNN, Random Forest, SVM, Logistic Regression. *Remember that for some models scaling the data is important and for others it doesn't matter.*
||||
|
||||
- Train accuracy score < **0.98** on the train set (0). Write the result in the README.md.
- Test (last day) accuracy > **0.65** on the test set (0). Write the result in the README.md.
||||
- Display the confusion matrix for the best model in a DataFrame. Specify the index and column names (True label and Predicted label)
||||
- Plot the learning curve for the best model |
||||
- Save the trained model as a pickle file |
||||
|
||||
|
||||
**Advice: As the grid search takes time, I suggest preparing and testing the code first. Once you're confident it works, run the grid search overnight and analyse the results.**
||||
|
||||
*Hint*: The confusion matrix shows the misclassifications class per class. Try to detect if the model badly misclassifies one class as another. Then, do some research on the internet on the two forest cover types, find the differences and create some new features that underline these differences. More generally, model development is a cycle with several iterations. More details: https://serokell.io/blog/machine-learning-testing
||||
|
||||
## 3. Predict (last day) |
||||
|
||||
Once you have selected the best model and you are confident it will perform well on new data, you're ready to predict on the test set: |
||||
|
||||
- Load the trained model |
||||
- Predict on the test set and compute the accuracy |
||||
- Save the predictions in a csv file |
||||
- Add your score on the README.md |
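A sketch of a minimal `predict.py`, assuming the paths of the project structure above and that the same preprocessing used for training is applied to the test set:

```
import pickle

import pandas as pd

with open('results/best_model.pkl', 'rb') as f:
    model = pickle.load(f)

X_test = pd.read_csv('data/test.csv')  # apply the training preprocessing here
predictions = model.predict(X_test)
pd.DataFrame({'prediction': predictions}).to_csv('results/test_predictions.csv',
                                                 index=False)
```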
|
||||
# Forest Cover Type Prediction - Correction
||||
|
||||
The goal of this project is to use cartographic variables to classify forest categories. You will have to analyse the data, create features, and train a machine learning model on the cartographic data to make it as accurate as possible.
||||
|
||||
|
||||
|
||||
|
||||
## Problem |
||||
|
||||
|
||||
The expected structure of the project is: |
||||
|
||||
``` |
||||
project |
||||
│ README.md |
||||
│ environment.yml |
||||
│ |
||||
└───data |
||||
│ │ train.csv |
||||
│ | test.csv (not available first day) |
||||
| | covtype.info |
||||
│ |
||||
└───notebook |
||||
│ │ EDA.ipynb |
||||
| |
||||
|───scripts |
||||
| │ preprocessing_feature_engineering.py |
||||
| │ model_selection.py |
||||
│ | predict.py |
||||
│ |
||||
└───results |
||||
│ confusion_matrix_heatmap.png |
||||
│ learning_curve_best_model.png |
||||
│ test_predictions.csv |
||||
│ best_model.pkl |
||||
|
||||
``` |
||||
|
||||
- The readme file contains a description of the project and explains how to run the code from an empty environment. It also gives a summary of the implementation of each python file. The preprocessing, which is a key part, should be described precisely. Finally, it should contain a conclusion that gives the performance of the strategy.
||||
|
||||
- The environment has to contain all libraries used and their versions that are necessary to run the code. |
||||
|
||||
- The notebook is not evaluated. |
||||
|
||||
|
||||
## 1. Preprocessing and features engineering: |
||||
|
||||
|
||||
|
||||
## 2. Model selection and predict |
||||
|
||||
### Data splitting |
||||
|
||||
The data splitting structure is: |
||||
|
||||
``` |
||||
DATA |
||||
└───TRAIN FILE (0) |
||||
│ └───── Train (1): |
||||
│ | Fold0: |
||||
| | Train |
||||
| | Validation |
||||
| | Fold1: |
||||
| | Train |
||||
| | Validation |
||||
... ... ... |
||||
| | |
||||
| └───── Test (1) |
||||
│ |
||||
└─── TEST FILE (0)(available last day) |
||||
|
||||
``` |
||||
|
||||
- The train set (0) is divided into a train set (1) and a test set (1). The test set (1) represents less than 33% of the data.
- The cross validation splits the train set (1) into at least 5 folds. If the cross validation is stratified that's a good point, but it is not a requirement.
||||
|
||||
### Gridsearch |
||||
|
||||
- It contains at least these 5 different models: |
||||
- Gradient Boosting, KNN, Random Forest, SVM, Logistic Regression. |
||||
|
||||
There are many options: |
||||
- 5 grid searches on 1 model |
||||
- 1 grid search on 5 models |
||||
- 1 grid search on a pipeline that contains the preprocessing |
||||
- 5 grid searches on a pipeline that contains the preprocessing |
||||
|
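A minimal sketch of a grid search on a pipeline that contains the preprocessing; the model and parameter grid are only examples:

```
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ("scaler", StandardScaler()),                      # preprocessing step
    ("clf", RandomForestClassifier(random_state=42)),  # model
])

param_grid = {
    "clf__n_estimators": [100, 300],
    "clf__max_depth": [10, 20, None],
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
```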
||||
### Training |
||||
|
||||
- Check that the **target is removed from the X** matrix |
||||
|
||||
### Results |
||||
Run predict.py on the test set, check that: |
||||
- Test (last day) accuracy > **0.65**. |
||||
|
||||
Then, check: |
||||
- Train accuracy score < **0.98**. It can be checked on the learning curve. If you are not sure, load the model, load the training set (0), and score the model on the training set (0). |
||||
- The confusion matrix is represented as a DataFrame. Example: |
||||
![alt text][confusion_matrix] |
||||
|
||||
[confusion_matrix]: images/weekend/w2_weekend_confusion_matrix.png "Confusion matrix " |
||||
|
||||
- The learning curve for the best model is plotted. Example: |
||||
|
||||
![alt text][logo_learning_curve] |
||||
|
||||
[logo_learning_curve]: images/weekend/w2_weekend_learning_curve.png "Learning curve " |
||||
|
||||
Note: The green line on the plot shows the accuracy on the validation set, not on the test set (1) and not on the test set (0). |
||||
|
||||
- The trained model is saved as a pickle file (a minimal sketch follows) |
|
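A minimal sketch of saving and reloading the model with the standard library `pickle`, assuming the fitted `grid` from the grid search sketch above:

```
import pickle

# Save the best trained model
with open("results/best_model.pkl", "wb") as f:
    pickle.dump(grid.best_estimator_, f)

# Reload it later, e.g. in predict.py
with open("results/best_model.pkl", "rb") as f:
    model = pickle.load(f)
```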
After Width: | Height: | Size: 131 KiB |
After Width: | Height: | Size: 62 KiB |
@ -0,0 +1,36 @@
|
||||
# D02 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
|
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources |
||||
|
||||
|
||||
# Exercise 1 |
||||
|
||||
|
||||
|
||||
# Exercise 2 |
||||
|
||||
|
||||
|
||||
# Exercise 3 |
||||
|
||||
|
||||
|
||||
# Exercise 4 |
||||
|
||||
|
||||
|
||||
# Exercise 5 |
||||
|
@ -0,0 +1,243 @@
|
||||
# D02 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
Keras (TensorFlow backend) |
||||
The goal of this day is to learn to use Keras to build Neural Networks. |
||||
|
||||
There are two ways to build Keras models: sequential and functional. |
||||
|
||||
The sequential API allows you to create models layer by layer for most problems. It is limited in that it does not allow you to create models that share layers or have multiple inputs or outputs. The exercises focus on the usage of the sequential API. |
||||
|
||||
Version used: `2.4.3` |
||||
|
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
The correction will provide the code and output because it is not straightforward to reproduce results using Keras. There are many sources of randomness. Even if all the seeds are fixed to a constant, there may be other sources of randomness. https://machinelearningmastery.com/reproducible-results-neural-networks-keras/ |
||||
||||
## Resources |
||||
https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/ |
||||
|
||||
# Exercise 1 Sequential |
||||
|
||||
The goal of this exercise is to learn to call the object `Sequential`. |
||||
|
||||
1. Put the object Sequential in a variable named `model` and print the variable `model`. |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is: `<tensorflow.python.keras.engine.sequential.Sequential object at xxx` |
||||
|
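A minimal sketch that produces this output:

```
from tensorflow import keras

model = keras.Sequential()
print(model)
```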
||||
|
||||
|
||||
|
||||
|
||||
# Exercise 2 Dense |
||||
|
||||
The goal of this exercise is to learn to create layers of neurons. Keras proposes options to create custom layers. The neural networks built in these exercises do not require custom layers; `Dense` layers do the job. A dense layer is simply a layer where each unit or neuron is connected to each neuron in the next layer. As seen yesterday, there are three main types of layers: input, hidden and output. The **input layer**, which specifies the number of inputs (features), is not represented as a layer in Keras. However, `Dense` has a parameter `input_dim` that gives the number of inputs coming from the previous layer. The output layer, like any hidden layer, can be created using `Dense`; the only difference is that the output layer contains a single neuron. |
||||
|
||||
1. Create a `Dense` layer with these parameters and return the output of `get_config` (a minimal sketch follows question 3): |
||||
|
||||
- First hidden layer connected to 5 input variables. |
||||
- 8 neurons |
||||
- sigmoid as activation function |
||||
|
||||
|
||||
2. Create a `Dense` layer with these parameters and return the output of `get_config`: |
||||
|
||||
- Hidden layer (not the first one) |
||||
- 4 neurons |
||||
- sigmoid as activation function |
||||
|
||||
3. Create a `Dense` layer with these parameters and return the output of `get_config`: |
||||
|
||||
- Output layer |
||||
- 1 neuron |
||||
- sigmoid as activation function |
||||
|
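A minimal sketch for the first question; the two other layers are created the same way, without `input_dim`:

```
from tensorflow.keras.layers import Dense

# First hidden layer: 8 neurons, sigmoid activation, connected to 5 input variables
layer = Dense(8, input_dim=5, activation='sigmoid')
print(layer.get_config())
```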
||||
## Correction |
||||
|
||||
1. This question is validated if the fields `batch_input_shape`, `units` and `activation` match this output: |
||||
|
||||
``` |
||||
{'name': 'dense_7', |
||||
'trainable': True, |
||||
'batch_input_shape': (None, 5), |
||||
'dtype': 'float32', |
||||
'units': 8, |
||||
'activation': 'sigmoid', |
||||
'use_bias': True, |
||||
'kernel_initializer': {'class_name': 'GlorotUniform', |
||||
'config': {'seed': None}}, |
||||
'bias_initializer': {'class_name': 'Zeros', 'config': {}}, |
||||
'kernel_regularizer': None, |
||||
'bias_regularizer': None, |
||||
'activity_regularizer': None, |
||||
'kernel_constraint': None, |
||||
'bias_constraint': None} |
||||
``` |
||||
|
||||
2. This question is validated if the fields `units` and `activation` match this output: |
||||
|
||||
``` |
||||
{'name': 'dense_8', |
||||
'trainable': True, |
||||
'dtype': 'float32', |
||||
'units': 4, |
||||
'activation': 'sigmoid', |
||||
'use_bias': True, |
||||
'kernel_initializer': {'class_name': 'GlorotUniform', |
||||
'config': {'seed': None}}, |
||||
'bias_initializer': {'class_name': 'Zeros', 'config': {}}, |
||||
'kernel_regularizer': None, |
||||
'bias_regularizer': None, |
||||
'activity_regularizer': None, |
||||
'kernel_constraint': None, |
||||
'bias_constraint': None} |
||||
``` |
||||
3. This question is validated if the fields `units` and `activation` match this output: |
||||
|
||||
``` |
||||
{'name': 'dense_9', |
||||
'trainable': True, |
||||
'dtype': 'float32', |
||||
'units': 1, |
||||
'activation': 'sigmoid', |
||||
'use_bias': True, |
||||
'kernel_initializer': {'class_name': 'GlorotUniform', |
||||
'config': {'seed': None}}, |
||||
'bias_initializer': {'class_name': 'Zeros', 'config': {}}, |
||||
'kernel_regularizer': None, |
||||
'bias_regularizer': None, |
||||
'activity_regularizer': None, |
||||
'kernel_constraint': None, |
||||
'bias_constraint': None} |
||||
``` |
||||
|
||||
# Exercise 3 Architecture |
||||
|
||||
The goal of this exercise is to combine the layers to create a neural network. |
||||
|
||||
1. Create a neural network for regression with the following architecture and return `print(model.summary())`: |
||||
|
||||
- 5 input variables |
||||
- hidden layer 1: 8 neurons and sigmoid as activation function |
||||
- hidden layer 2: 4 neurons and sigmoid as activation function |
||||
- output layer: 1 neuron. Find the adapted activation function |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the code that creates the neural network is: |
||||
|
||||
``` |
||||
from tensorflow import keras |
||||
from tensorflow.keras.layers import Dense |
||||
| 
||||
model = keras.Sequential() |
||||
model.add(Dense(8, input_shape=(5,), activation='sigmoid')) |
||||
model.add(Dense(4, activation='sigmoid')) |
||||
model.add(Dense(1, activation='linear')) |
||||
|
||||
``` |
||||
|
||||
The first two layers could use another activation function than sigmoid (e.g. relu). |
||||
# Exercise 4 Optimize |
||||
|
||||
The goal of this exercise is to learn to train the neural network. Once the architecture of the neural network is set, there are two steps to train it: |
||||
|
||||
- `compile`: The compilation step aims to set the loss function, to choose the algorithm that minimizes the chosen loss function, and to choose the metric(s) the model outputs. |
||||
|
||||
- The **optimizer**. We’ll stick with a pretty good default: the Adam gradient-based optimizer. Keras has many other optimizers you can look into as well. |
||||
- The **loss function**. Depending on the problem to solve, classification or regression, Keras proposes different loss functions. In classification, Keras distinguishes between `binary_crossentropy` (2 classes) and `categorical_crossentropy` (more than 2 classes). |
||||
- The **metric(s)**. A list of metrics. Depending on the problem to solve, classification or regression, Keras proposes different metrics. For example, for classification the metric can be the accuracy. |
||||
|
||||
|
||||
- `fit`: Training a model in Keras literally consists only of calling `fit()` and specifying some parameters. There are a lot of possible parameters, but we’ll only manually supply a few: |
||||
- The **training data**, commonly known as X and y. |
||||
- The **number of epochs** (iterations over the entire dataset) to train for. |
||||
- The **batch size** (number of samples per gradient update) to use when training. |
||||
|
||||
This article gives more details about **epoch** and **batch size**: https://machinelearningmastery.com/difference-between-a-batch-and-an-epoch/ |
||||
|
||||
1. Create the following neural network (classification): |
||||
- Set the right number of input variables |
||||
- hidden layer 1: 10 neurons and sigmoid as activation function. |
||||
- hidden layer 2: 5 neurons and sigmoid as activation function. |
||||
- output layer: 1 neuron and sigmoid as activation function. |
||||
- Choose the accuracy metric, the adam optimizer, the adapted loss and a number of epochs smaller than 50. |
||||
|
||||
Import the breast cancer data set from `sklearn.datasets` using `load_breast_cancer` and train the neural network on the data set. |
||||
|
||||
2. Scale the data using `StandardScaler` from `sklearn.preprocessing`. Train the neural network again. |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the fields `batch_input_shape`, `units` and `activation` of the output of `model.get_config()['layers']` match this output: |
||||
|
||||
``` |
||||
[{'class_name': 'InputLayer', |
||||
'config': {'batch_input_shape': (None, 30), |
||||
'dtype': 'float32', |
||||
'sparse': False, |
||||
'ragged': False, |
||||
'name': 'dense_134_input'}}, |
||||
{'class_name': 'Dense', |
||||
'config': {'name': 'dense_134', |
||||
'trainable': True, |
||||
'batch_input_shape': (None, 30), |
||||
'dtype': 'float32', |
||||
'units': 10, |
||||
'activation': 'sigmoid', |
||||
'use_bias': True, |
||||
'kernel_initializer': {'class_name': 'GlorotUniform', |
||||
'config': {'seed': None}}, |
||||
'bias_initializer': {'class_name': 'Zeros', 'config': {}}, |
||||
'kernel_regularizer': None, |
||||
'bias_regularizer': None, |
||||
'activity_regularizer': None, |
||||
'kernel_constraint': None, |
||||
'bias_constraint': None}}, |
||||
{'class_name': 'Dense', |
||||
'config': {'name': 'dense_135', |
||||
'trainable': True, |
||||
'dtype': 'float32', |
||||
'units': 5, |
||||
'activation': 'sigmoid', |
||||
'use_bias': True, |
||||
'kernel_initializer': {'class_name': 'GlorotUniform', |
||||
'config': {'seed': None}}, |
||||
'bias_initializer': {'class_name': 'Zeros', 'config': {}}, |
||||
'kernel_regularizer': None, |
||||
'bias_regularizer': None, |
||||
'activity_regularizer': None, |
||||
'kernel_constraint': None, |
||||
'bias_constraint': None}}, |
||||
{'class_name': 'Dense', |
||||
'config': {'name': 'dense_136', |
||||
'trainable': True, |
||||
'dtype': 'float32', |
||||
'units': 1, |
||||
'activation': 'sigmoid', |
||||
'use_bias': True, |
||||
'kernel_initializer': {'class_name': 'GlorotUniform', |
||||
'config': {'seed': None}}, |
||||
'bias_initializer': {'class_name': 'Zeros', 'config': {}}, |
||||
'kernel_regularizer': None, |
||||
'bias_regularizer': None, |
||||
'activity_regularizer': None, |
||||
'kernel_constraint': None, |
||||
'bias_constraint': None}}] |
||||
``` |
||||
You should notice that the neural network struggles to learn. By luck, the initialization of the weights might lead to an accuracy close to 90%. But when I trained the neural network with `batch_size=300` on the data, here is the output of the last epoch (50): |
||||
|
||||
`Epoch 50/50 |
||||
2/2 [==============================] - 0s 1ms/step - loss: 0.6559 - accuracy: 0.6274` |
||||
|
||||
2. This solution is validated if the accuracy at epoch 50 is higher than 95%. A training sketch follows. |
||||
|
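A training sketch for question 2, assuming standard imports; the exact accuracy depends on the random initialization:

```
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

X, y = load_breast_cancer(return_X_y=True)
X_scaled = StandardScaler().fit_transform(X)

model = Sequential()
model.add(Dense(10, input_dim=30, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_scaled, y, epochs=50, batch_size=300)
```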
||||
|
||||
@ -0,0 +1,250 @@
|
||||
# D02 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
Keras (TensorFlow backend) |
||||
The goal of this day is to learn to use Keras to build Neural Networks and train them on small data sets. |
||||
|
||||
classification & regression |
||||
|
||||
Version used: `2.4.3` |
||||
|
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
The correction will provide the code and output because it is not straightforward to reproduce results using Keras. There are many sources of randomness. Even if all the seeds are fixed to a constant, there may be other sources of randomness. https://machinelearningmastery.com/reproducible-results-neural-networks-keras/ |
||||
||||
## Resources |
||||
https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/ |
||||
|
||||
|
||||
# Exercise 1 Regression - Optimize |
||||
|
||||
The goal of this exercise is to learn to set up the optimization for a regression neural network. There's no code to run in this exercise. In W2D2E3, we implemented a neural network designed for regression. We will be using this neural network: |
||||
|
||||
``` |
||||
model = keras.Sequential() |
||||
model.add(Dense(8, input_shape=(5,), activation='sigmoid')) |
||||
model.add(Dense(4, activation='sigmoid')) |
||||
model.add(Dense(1, activation='linear')) |
||||
|
||||
``` |
||||
As a reminder, the main difference between the architectures of a regression and a classification neural network is the output layer's activation function. |
||||
|
||||
1. Fill this chunk of code to set up the optimization part of the regression neural network: |
||||
|
||||
``` |
||||
model.compile( |
||||
optimizer='adam', |
||||
loss='',#TODO1 |
||||
metrics=[''] #TODO2 |
||||
) |
||||
``` |
||||
Hint: |
||||
- Mean Squared Error (MSE) and Mean Absolute Error (MAE) are common loss functions used for regression problems. Mean Absolute Error is less sensitive to outliers. Different loss functions are used for classification problems. Similarly, evaluation metrics used for regression differ from classification. |
||||
|
||||
https://keras.io/api/metrics/regression_metrics/ |
||||
|
||||
## Correction: |
||||
|
||||
1. This question is validated if the chunk of code is: |
||||
|
||||
``` |
||||
model.compile( |
||||
optimizer='adam', |
||||
loss='mse', |
||||
metrics=['mse'] |
||||
) |
||||
``` |
||||
All correct regression metrics or losses are accepted. As explained before, loss functions are chosen for their nice mathematical properties. That is why, most of the time, the loss function used for regression is the MSE or the MAE. |
||||
|
||||
https://keras.io/api/losses/regression_losses/ |
||||
https://keras.io/api/metrics/regression_metrics/ |
||||
|
||||
|
||||
# Exercise 2 Regression example |
||||
|
||||
The goal of this exercise is to learn to train a neural network to perform a regression on a data set. |
||||
The data set is the Auto MPG Dataset and the goal is to build a model to predict the fuel efficiency of late-1970s and early-1980s automobiles. To do this, provide the model with a description of many automobiles from that time period. This description includes attributes like: cylinders, displacement, horsepower, and weight. |
||||
|
||||
https://www.tensorflow.org/tutorials/keras/regression |
||||
|
||||
|
||||
1. Preprocess the data set as follows (a sketch follows the list): |
||||
- Drop the columns: **model year**, **origin**, **car name** |
||||
- Split train test without shuffling the data. Keep 20% for the test set. |
||||
- Scale the data using Standard Scaler |
||||
|
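A preprocessing sketch, assuming the Auto MPG data is already loaded in a DataFrame `df` with the columns listed above:

```
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = df.drop(columns=["model year", "origin", "car name"]).dropna()
X = df.drop(columns=["mpg"])
y = df[["mpg"]]

# Split train test without shuffling the data, 20% for the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Scale with the train statistics only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
```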
||||
|
||||
2. Train a neural network on the train set and predict on the test set. The neural network should have 2 hidden layers and the loss should be **mean_squared_error**. The expected **mean absolute error** on the test set is at most 10. |
||||
*Hint*: increase the number of epochs |
||||
**Warning**: Do not forget to evaluate the neural network on the **SCALED** test set. |
||||
|
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the input DataFrames are: |
||||
|
||||
X_train_scaled shape is (313, 5) and the first 5 rows are: |
||||
|
||||
| | cylinders | displacement | horsepower | weight | acceleration | |
||||
|---:|------------:|---------------:|-------------:|---------:|---------------:| |
||||
| 0 | 1.28377 | 0.884666 | 0.48697 | 0.455708 | -1.19481 | |
||||
| 1 | 1.28377 | 1.28127 | 1.36238 | 0.670459 | -1.37737 | |
||||
| 2 | 1.28377 | 0.986124 | 0.987205 | 0.378443 | -1.55992 | |
||||
| 3 | 1.28377 | 0.856996 | 0.987205 | 0.375034 | -1.19481 | |
||||
| 4 | 1.28377 | 0.838549 | 0.737087 | 0.393214 | -1.74247 | |
||||
|
||||
The train target is: |
||||
|
||||
| | mpg | |
||||
|---:|------:| |
||||
| 0 | 18 | |
||||
| 1 | 15 | |
||||
| 2 | 18 | |
||||
| 3 | 16 | |
||||
| 4 | 17 | |
||||
|
||||
|
||||
X_test_scaled shape is (79, 5) and the first 5 rows are: |
||||
|
||||
| | cylinders | displacement | horsepower | weight | acceleration | |
||||
|----:|------------:|---------------:|-------------:|----------:|---------------:| |
||||
| 315 | -1.00255 | -0.554185 | -0.5135 | -0.113552 | 1.76253 | |
||||
| 316 | 0.140612 | 0.128347 | -0.5135 | 0.31595 | 1.25139 | |
||||
| 317 | -1.00255 | -1.05225 | -0.813641 | -1.03959 | 0.192584 | |
||||
| 318 | -1.00255 | -0.710983 | -0.5135 | -0.445337 | 0.0830525 | |
||||
| 319 | -1.00255 | -0.840111 | -0.888676 | -0.637363 | 0.813262 | |
||||
|
||||
The test target is: |
||||
|
||||
| | mpg | |
||||
|----:|------:| |
||||
| 315 | 24.3 | |
||||
| 316 | 19.1 | |
||||
| 317 | 34.3 | |
||||
| 318 | 29.8 | |
||||
| 319 | 31.3 | |
||||
|
||||
2. This question is validated if the mean absolute error on the test set is smaller than 10. Here is an architecture that works: |
||||
|
||||
``` |
||||
from tensorflow.keras.models import Sequential |
||||
from tensorflow.keras.layers import Dense |
||||
| 
||||
# create model |
||||
model = Sequential() |
||||
model.add(Dense(30, input_dim=5, activation='sigmoid')) |
||||
model.add(Dense(30, activation='sigmoid')) |
||||
model.add(Dense(1)) |
||||
# Compile model |
||||
model.compile(loss='mean_squared_error', |
||||
              optimizer='adam', metrics=['mean_absolute_error']) |
||||
``` |
||||
|
||||
The output neuron has to be `Dense(1)` - by default the activation function is linear. The loss has to be **mean_squared_error** and the **input_dim** has to be **5**. All variations on the other parameters are accepted. |
||||
|
||||
*Hint*: To get the score on the test set, `evaluate` could have been used: `model.evaluate(X_test_scaled, y_test)`. |
||||
|
||||
# Exercise 3 Multi classification - Softmax |
||||
|
||||
The goal of this exercise is to learn to design a neural network architecture for multi-class data. This is an important type of problem on which to practice with neural networks because the three class values require specialized handling. A multi-classification neural network uses a **softmax** layer as its output layer. The **softmax** activation function is an extension of the sigmoid as it is designed to output the probabilities of belonging to each class in a multi-class problem. This output layer has to contain as many neurons as there are classes in the multi-classification problem. This article explains in detail how it works: https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax |
||||
|
||||
Let us assume we want to classify images and we know they contain either apples, bears, candies, eggs or dogs (extension of the example in the link above). |
||||
|
||||
1. Create the architecture for a multi-class neural network with the following architecture and return `print(model.summary())`: |
||||
|
||||
- 5 input variables |
||||
- hidden layer 1: 16 neurons and sigmoid as activation function |
||||
- hidden layer 2: 8 neurons and sigmoid as activation function |
||||
- output layer: The number of neurons and the activation function should be adapted to this multi-classification problem. |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the code that creates the neural network is: |
||||
|
||||
``` |
||||
model = keras.Sequential() |
||||
model.add(Dense(16, input_shape=(5,), activation='sigmoid')) |
||||
model.add(Dense(8, activation='sigmoid')) |
||||
model.add(Dense(5, activation='softmax')) |
||||
|
||||
``` |
||||
# Exercise 4 Multi classification - Optimize |
||||
|
||||
The goal of this exercise is to learn to optimize a multi-classification neural network. As learnt previously, the loss function used in binary classification is the log loss - also called in Keras `binary_crossentropy`. This function is defined for binary classification and can be extended to multi-classification. In Keras, the extended loss that supports multi-classification is `categorical_crossentropy`. There's no code to run in this exercise. |
||||
|
||||
1. Fill the chunk of code below in order to optimize the neural network defined in the previous exercise. Choose the adapted loss, adam as optimizer and the accuracy as metric. |
||||
|
||||
``` |
||||
model.compile(loss='',#TODO1 |
||||
optimizer='', #TODO2 |
||||
metrics=['']) #TODO3 |
||||
``` |
||||
## Correction |
||||
|
||||
1. This question is validated if the chunk of code is: |
||||
|
||||
``` |
||||
model.compile(loss='categorical_crossentropy', |
||||
optimizer='adam', |
||||
metrics=['accuracy']) |
||||
``` |
||||
|
||||
# Exercise 5 Multi classification example |
||||
|
||||
The goal of this exercise is to learn to use a neural network to classify a multi-class data set. The data set used is the Iris data set, which allows classifying flowers given basic features such as the flower's measurements. |
||||
|
||||
Preliminary: |
||||
- Split train test. Keep 20% for the test set. Use `random_state=1`. |
||||
- Scale the data using Standard Scaler |
||||
|
||||
|
||||
1. Use the `LabelBinarizer` from Scikit-learn to create a one-hot encoding of the target. As you know, the output layer of a multi-classification neural network has a shape equal to the number of classes. The output layer expects a target with the same shape as its output (a minimal sketch follows). |
||||
|
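A minimal sketch of the one-hot encoding; the variable names are assumptions:

```
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
y_train_multi_class = lb.fit_transform(y_train)  # shape (n_samples, 3)
y_test_multi_class = lb.transform(y_test)
```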
||||
2. Train a neural network on the train set and predict on the test set. The neural network should have 1 hidden layer. The expected **accuracy** on the test set is at least 90%. |
||||
*Hint*: increase the number of epochs |
||||
**Warning**: Do not forget to evaluate the neural network on the **SCALED** test set. |
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output of the first ten values of the train labels are: |
||||
|
||||
``` |
||||
array([[0, 1, 0], |
||||
[0, 0, 1], |
||||
[0, 1, 0], |
||||
[0, 0, 1], |
||||
[0, 0, 1], |
||||
[1, 0, 0], |
||||
[0, 1, 0], |
||||
[1, 0, 0], |
||||
[0, 1, 0], |
||||
[0, 0, 1]]) |
||||
``` |
||||
|
||||
2. This question is validated if the accuracy on the test set is higher than 90%. To evaluate the accuracy on the test set you can use: `model.evaluate(X_test_sc, y_test_multi_class)`. |
||||
|
||||
Here is an implementation that gives 96% accuracy on the test set. |
||||
|
||||
``` |
||||
from tensorflow.keras.models import Sequential |
||||
from tensorflow.keras.layers import Dense |
||||
| 
||||
model = Sequential() |
||||
model.add(Dense(10, input_dim=4, activation='sigmoid')) |
||||
model.add(Dense(3, activation='softmax')) |
||||
# Compile model |
||||
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) |
||||
model.fit(X_train_sc, y_train_multi_class, epochs=1000, batch_size=20) |
||||
``` |
||||
|
||||
|
||||
|
||||
|
||||
# Exercise 6 GridSearch |
||||
|
||||
https://medium.com/@am.benatmane/keras-hyperparameter-tuning-using-sklearn-pipelines-grid-search-with-cross-validation-ccfc74b0ce9f |
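A grid search sketch in the spirit of the article, using the scikit-learn wrapper shipped with TensorFlow 2.4; the Iris variables from the previous exercise are assumptions:

```
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def build_model(units=10):
    model = Sequential()
    model.add(Dense(units, input_dim=4, activation='sigmoid'))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Wrap the Keras model so it behaves like a scikit-learn estimator
clf = KerasClassifier(build_fn=build_model, epochs=100, batch_size=20, verbose=0)
grid = GridSearchCV(clf, {"units": [5, 10, 20]}, cv=3)
grid.fit(X_train_sc, y_train_multi_class)
print(grid.best_params_)
```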
@ -0,0 +1,230 @@
|
||||
# Week 3 D02 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
|
||||
# Introduction |
||||
|
||||
Embeddings ... |
||||
|
||||
Library: Spacy is a natural language processing (NLP) library for Python designed to have fast performance, and with word embedding models built in, it’s perfect for a quick and easy start. |
||||
|
||||
There are many types of language models pre-trained in spaCy. Each has its specificities depending on the hypotheses taken. |
||||
## Historical |
||||
|
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources |
||||
|
||||
# Exercise 1 Embedding 1 |
||||
|
||||
The goal of this exercise is to learn to load an embedding in spaCy. |
||||
|
||||
1. Install and load the `en_core_web_sm` model. Compute the embedding of `car` (see the sketch below). |
||||
|
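A minimal sketch, assuming the model has been downloaded with `python -m spacy download en_core_web_sm`:

```
import spacy

nlp = spacy.load("en_core_web_sm")
embedding = nlp("car").vector
print(embedding.shape)   # (96,)
print(embedding[:20])
```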
||||
## Correction |
||||
|
||||
1. This question is validated if the embedding's shape is `(96,)` |
||||
and the first 20 values of the vector are: |
||||
|
||||
``` |
||||
array([ 1.0522802e+00, 1.4806499e+00, 7.7402556e-01, 1.0373484e+00, |
||||
4.1474584e-01, -5.7604712e-01, 3.0856287e+00, 1.4814860e-01, |
||||
-3.0170975e+00, 3.4453702e+00, 6.3330579e-01, 1.1655847e+00, |
||||
3.8489954e+00, 2.3469532e+00, 5.0532556e-01, -1.9386177e+00, |
||||
9.7954911e-01, 2.3573284e+00, -1.9812435e-03, 5.5679207e+00], |
||||
dtype=float32) |
||||
|
||||
``` |
||||
|
||||
# Exercise 2: Tokenization |
||||
|
||||
|
||||
The goal of this exercise is to learn to tokenize a document using spaCy. We did this using NLTK yesterday. |
||||
|
||||
1. Tokenize the text below and print the tokens |
||||
|
||||
``` |
||||
text = "Tokenize this sentence. And this one too." |
||||
|
||||
``` |
||||
|
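A minimal sketch, assuming `en_core_web_sm` is installed:

```
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Tokenize this sentence. And this one too.")
for token in doc:
    print(token.text)
```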
||||
## Correction |
||||
|
||||
1. The question is validated if the tokens printed are: |
||||
|
||||
``` |
||||
Tokenize |
||||
this |
||||
sentence |
||||
. |
||||
And |
||||
this |
||||
one |
||||
too |
||||
. |
||||
``` |
||||
|
||||
## Exercise 3 Embeddings 2 |
||||
|
||||
The goal of this exercise is to learn to use spaCy embeddings on a document. |
||||
|
||||
1. Compute the embedding of all the words in this sentence. The language model considered is `en_core_web_md` |
||||
|
||||
``` |
||||
"laptop computer coffee tea water liquid dog cat kitty" |
||||
``` |
||||
|
||||
2. Plot the pairwise cosine distances between all the words in a heatmap (a plotting sketch follows the links below). |
||||
|
||||
![alt text][logo] |
||||
|
||||
[logo]: w3day05ex1_plot.png "Plot" |
||||
|
||||
https://medium.com/datadriveninvestor/cosine-similarity-cosine-distance-6571387f9bf8 |
||||
|
||||
|
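A plotting sketch, assuming `en_core_web_md` is installed; seaborn is used here for the heatmap:

```
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.metrics.pairwise import cosine_distances

nlp = spacy.load("en_core_web_md")
words = "laptop computer coffee tea water liquid dog cat kitty".split()
vectors = np.array([nlp(word).vector for word in words])

# Pairwise cosine distances between the word embeddings
distances = cosine_distances(vectors)
sns.heatmap(distances, xticklabels=words, yticklabels=words)
plt.show()
```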
||||
## Correction |
||||
|
||||
1. This question is validated if the embedding of each word has a shape of `(300,)` and if the first 20 values of the embedding of laptop are: |
||||
|
||||
``` |
||||
array([-0.37639 , -0.075521, 0.4908 , 0.19863 , -0.11088 , -0.076145, |
||||
-0.30367 , -0.69663 , 0.87048 , 0.54388 , 0.42523 , 0.18045 , |
||||
-0.4358 , -0.32606 , -0.70702 , -0.069127, -0.42674 , 2.4147 , |
||||
0.26806 , 0.46584 ], dtype=float32) |
||||
|
||||
``` |
||||
|
||||
2. This question is validated if the output is |
||||
|
||||
![alt text][logo] |
||||
|
||||
[logo]: w3day05ex1_plot.png "Plot" |
||||
|
||||
|
||||
# Exercise 4 Sentences' similarity |
||||
|
||||
The goal of this exercise is to learn to compute the similarity between two sentences. As explained in the documentation: **the word embedding of a full sentence is simply the average over all different words**. This is how `similarity` works in spaCy. This small use case is very interesting because if we build a corpus of sentences that express an intention such as **buy shoes**, then we can detect this intention and use it to propose shoe advertisements to customers. The language model used in this exercise is `en_core_web_sm`. |
||||
|
||||
|
||||
1. Compute the similarities (3 in total) between these sentences: |
||||
|
||||
``` |
||||
sentence_1 = "I want to buy shoes" |
||||
sentence_2 = "I would love to purchase running shoes" |
||||
sentence_3 = "I am in my room" |
||||
|
||||
``` |
||||
|
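A minimal sketch, assuming the three sentences defined above:

```
import spacy

nlp = spacy.load("en_core_web_sm")
doc_1 = nlp(sentence_1)
doc_2 = nlp(sentence_2)
doc_3 = nlp(sentence_3)

print("sentence_1 <=> sentence 2 :", doc_1.similarity(doc_2))
print("sentence_1 <=> sentence 3 :", doc_1.similarity(doc_3))
print("sentence_2 <=> sentence 3 :", doc_2.similarity(doc_3))
```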
||||
## Correction |
||||
|
||||
1. This question is validated if the similarities between the sentences are: |
||||
|
||||
``` |
||||
sentence_1 <=> sentence 2 : 0.7073220863266589 |
||||
sentence_1 <=> sentence 3: 0.42663743263528325 |
||||
sentence_2 <=> sentence 3: 0.3336274235605957 |
||||
|
||||
``` |
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Exercise 5: NER |
||||
|
||||
The goal of this exercise is to learn to use a Named Entity Recognition (NER) algorithm to detect entities. |
||||
|
||||
``` |
||||
Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. It is considered one of the Big Five companies in the U.S. information technology industry, along with Amazon, Google, Microsoft, and Facebook. |
||||
Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sell Wozniak's Apple I personal computer, though Wayne sold his share back within 12 days. It was incorporated as Apple Computer, Inc., in January 1977, and sales of its computers, including the Apple I and Apple II, grew quickly. |
||||
``` |
||||
|
||||
1. Extract all named entities in the text as well as the label of the named entity. |
||||
|
||||
2. The NER is also useful to remove ambiguous entities. From a conceptual standpoint, disambiguation is the process of determining the most probable meaning of a specific phrase. For example, in the sentence below, the word `apple` is present twice: the first time to mention the fruit and the second to mention a company. Run the NER on this sentence and print the named entity, the `start_char`, the `end_char` and the label of the named entity. |
||||
|
||||
``` |
||||
Paul eats an apple while watching a movie on his Apple device. |
||||
``` |
||||
https://en.wikipedia.org/wiki/Named-entity_recognition |
||||
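A minimal sketch for both questions, assuming the paragraph above is stored in a variable `text`:

```
import spacy

nlp = spacy.load("en_core_web_sm")

# Question 1: named entities and their labels
for ent in nlp(text).ents:
    print(ent.text, ent.label_)

# Question 2: disambiguation of the two occurrences of apple
for ent in nlp("Paul eats an apple while watching a movie on his Apple device.").ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
```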
## Correction |
||||
|
||||
1. This question is validated if the output of the NER is: |
||||
|
||||
``` |
||||
Apple Inc. ORG |
||||
American NORP |
||||
Cupertino GPE |
||||
California GPE |
||||
Five CARDINAL |
||||
U.S. GPE |
||||
Amazon ORG |
||||
Google ORG |
||||
Microsoft ORG |
||||
Facebook ORG |
||||
Apple ORG |
||||
Steve Jobs PERSON |
||||
Steve Wozniak PERSON |
||||
Ronald Wayne PERSON |
||||
April 1976 DATE |
||||
Wozniak PERSON |
||||
Apple ORG |
||||
Wayne PERSON |
||||
12 days DATE |
||||
Apple Computer, Inc. ORG |
||||
January 1977 DATE |
||||
Apple ORG |
||||
Apple II ORG |
||||
``` |
||||
2. This question is validated if the output shows that the first occurrence of apple is not a named entity. In my case, here is what the NER returns: |
||||
|
||||
``` |
||||
Paul 1 5 PERSON |
||||
Apple 50 55 ORG |
||||
|
||||
``` |
||||
|
||||
# Exercise 6 Part-of-speech tags |
||||
|
||||
The goal of this exercise is to learn to use Part-of-speech tags (**POS tags**) using spaCy. As explained in Wikipedia, POS tagging is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context. |
||||
|
||||
Example |
||||
|
||||
The sentence: **"Heat water in a large vessel"** is tagged this way after the POS TAG: |
||||
- heat verb (noun) |
||||
- water noun (verb) |
||||
- in prep (noun, adv) |
||||
- a det (noun) |
||||
- large adj (noun) |
||||
- vessel noun |
||||
|
||||
|
||||
The data used, `news_amazon.txt`, is a news article about Amazon. |
||||
|
||||
1. Return all sentences mentioning **Bezos** as a NNP (tag). |
||||
|
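A minimal sketch, assuming `news_amazon.txt` is in the working directory:

```
import spacy

nlp = spacy.load("en_core_web_sm")
with open("news_amazon.txt") as f:
    doc = nlp(f.read())

for sent in doc.sents:
    for token in sent:
        if token.text == "Bezos" and token.tag_ == "NNP":
            print("INFO:", token.text, token.pos_, token.tag_)
            print("Sentence:", sent.text)
            break
```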
||||
## Correction |
||||
|
||||
1. This question is validated if the sentences printed are: |
||||
|
||||
``` |
||||
INFO: Bezos PROPN NNP |
||||
Sentence: Amazon (AMZN) enters 2021 with plenty of big opportunities, but is losing its lauded Chief Executive Jeff Bezos, who announced his plan to step aside in the third quarter. |
||||
|
||||
|
||||
INFO: Bezos PROPN NNP |
||||
Sentence: Bezos will hand off his role as chief executive to Andy Jassy, the CEO of its cloud computing unit. |
||||
|
||||
|
||||
INFO: Bezos PROPN NNP |
||||
Sentence: He's not leaving, as Bezos will transition to the role of Executive Chairman and remain active. |
||||
|
||||
|
||||
INFO: Bezos PROPN NNP |
||||
Sentence: "When you look at our financial results, what you're actually seeing are the long-run cumulative results of invention," Bezos said in written remarks with the Amazon earnings release. |
||||
``` |
After Width: | Height: | Size: 33 KiB |
@ -0,0 +1,280 @@
|
||||
# W1D1 Piscine AI - Data Science |
||||
|
||||
|
||||
# Table of Contents: |
||||
|
||||
Reproduce this article without backpropagation: |
||||
https://towardsdatascience.com/machine-learning-for-beginners-an-introduction-to-neural-networks-d49f22d238f9 |
||||
|
||||
# Introduction |
||||
|
||||
Deep learning is a huge domain. We will focus on Artificial Neural Networks. The goal is to understand how neural networks train, to train them on data, and to understand the challenges of training a neural network. |
||||
Architectures such as RNN and LSTM (which learn sequences, used in time series and NLP) and CNN (used a lot in image processing) are well-known deep learning algorithms, but they won't be covered by the AI branch. Once you have a good understanding of ANNs, feel free to extend your knowledge to new architectures. |
||||
|
||||
|
||||
## Rules |
||||
|
||||
## Resources |
||||
https://victorzhou.com/blog/intro-to-neural-networks/ |
||||
|
||||
|
||||
https://srnghn.medium.com/deep-learning-overview-of-neurons-and-activation-functions-1d98286cf1e4#:~:text=What%20is%20a%20neuron%3F,to%20become%20the%20neuron's%20output. |
||||
|
||||
Reproduce this article without backpropagation: |
||||
https://towardsdatascience.com/machine-learning-for-beginners-an-introduction-to-neural-networks-d49f22d238f9 |
||||
|
||||
# Exercise 1 The neuron |
||||
|
||||
The goal of this exercise is to understand the role of a neuron and to implement a neuron. |
||||
|
||||
An artificial neuron, the basic unit of the neural network, (also referred to as a perceptron) is a mathematical function. It takes one or more inputs that are multiplied by values called “weights” and added together. This value is then passed to a non-linear function, known as an activation function, to become the neuron’s output. |
||||
|
||||
As described in the article, **a neuron takes inputs, does some math with them, and produces one output**. |
||||
|
||||
Let us assume there are 2 inputs. Here are the three steps involved in the neuron: |
||||
|
||||
1. Each input is multiplied by a weight |
||||
- x1 -> x1 * w1 |
||||
- x2 -> x2 * w2 |
||||
2. The weighted inputs are added together with a bias b |
||||
- (x1 * w1) + (x2 * w2) + b |
||||
3. The sum is passed through an activation function |
||||
- y = f((x1 * w1) + (x2 * w2) + b) |
||||
|
||||
- The activation function is a function you know from W2DAY2 (Logistic Regression): **the sigmoid** |
||||
|
||||
Example: |
||||
|
||||
x1 = 2 , x2 = 3 , w1 = 0, w2= 1, b = 4 |
||||
|
||||
1. Step 1: Multiply by a weight |
||||
- x1 -> 2 * 0 = 0 |
||||
- x2 -> 3 * 1 = 3 |
||||
2. Step 2: Add weighted inputs and bias |
||||
- 0 + 3 + 4 = 7 |
||||
3. Step 3: Activation function |
||||
- y = f(7) = 0.999 |
||||
--- |
||||
1. Implement the function `feedforward` of the class `Neuron` that takes as input the inputs (x1, x2) and uses the attributes (the weights and the bias) to return y: |
||||
|
||||
|
||||
``` |
||||
class Neuron: |
||||
def __init__(self, weight1, weight2, bias): |
||||
self.weights_1 = weight1 |
||||
self.weights_2 = weight2 |
||||
self.bias = bias |
||||
|
||||
def feedforward(self, x1, x2): |
||||
#TODO |
||||
return y |
||||
|
||||
|
||||
``` |
||||
|
||||
Note: if you are comfortable with matrix multiplication, feel free to vectorize the operations as done in the article. |
||||
|
||||
https://victorzhou.com/blog/intro-to-neural-networks/ |
||||
|
||||
|
||||
## Correction: |
||||
|
||||
1. This question is validated if this code: |
||||
|
||||
``` |
||||
neuron = Neuron(0,1,4) |
||||
neuron.feedforward(2,3) |
||||
``` |
||||
|
||||
returns **0.9990889488055994**. |
||||
|
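A solution sketch that returns this value:

```
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class Neuron:
    def __init__(self, weight1, weight2, bias):
        self.weights_1 = weight1
        self.weights_2 = weight2
        self.bias = bias

    def feedforward(self, x1, x2):
        # weighted sum plus bias, passed through the sigmoid
        return sigmoid(x1 * self.weights_1 + x2 * self.weights_2 + self.bias)

neuron = Neuron(0, 1, 4)
print(neuron.feedforward(2, 3))  # 0.9990889488055994
```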
||||
|
||||
# Exercise 2 Neural network |
||||
|
||||
The goal of this exercise is to understand how to combine three neurons to form a neural network. A neural network is nothing else than neurons connected together. As shown in the figure, the neural network is composed of **layers**: |
||||
|
||||
- Input layer: it only represents input data. **It doesn't contain neurons**. |
||||
- Output layer: it represents the last layer. It contains a neuron (in some cases more than 1). |
||||
- Hidden layer: any layer between the input (first) layer and output (last) layer. Many hidden layers can be stacked. When there are many hidden layers, the neural network is deep. |
||||
|
||||
Notice that the neuron **o1** in the output layer takes as input the output of the neurons **h1** and **h2** in the hidden layer. |
||||
|
||||
In exercise 1, you implemented this neuron. |
||||
![alt text][neuron] |
||||
|
||||
[neuron]: images/day1/ex2/w3_day1_neuron.png "Plot" |
||||
|
||||
Now, we add two more neurons: |
||||
|
||||
- h2, the second neuron of the hidden layer |
||||
- o1, the neuron of the output layer |
||||
|
||||
|
||||
![alt text][nn] |
||||
|
||||
[nn]: images/day1/ex2/w3_day1_neural_network.png "Plot" |
||||
|
||||
1. Implement the function `feedforward` of the class `OurNeuralNetwork` that takes as input the input data and returns the output y. Return the output for these neurons: |
||||
|
||||
``` |
||||
neuron_h1 = Neuron(1,2,-1) |
||||
neuron_h2 = Neuron(0.5,1,0) |
||||
neuron_o1 = Neuron(2,0,1) |
||||
``` |
||||
|
||||
``` |
||||
class OurNeuralNetwork: |
||||
|
||||
def __init__(self, neuron_h1, neuron_h2, neuron_o1): |
||||
self.h1 = neuron_h1 |
||||
self.h2 = neuron_h2 |
||||
self.o1 = neuron_o1 |
||||
|
||||
def feedforward(self, x1, x2): |
||||
# The inputs for o1 are the outputs from h1 and h2 |
||||
# TODO |
||||
return y |
||||
|
||||
``` |
||||
|
||||
|
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is: **0.9524917424084265** |
||||
|
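A solution sketch reusing the `Neuron` class from exercise 1; the inputs (2, 3) are assumed, as in exercise 1:

```
class OurNeuralNetwork:

    def __init__(self, neuron_h1, neuron_h2, neuron_o1):
        self.h1 = neuron_h1
        self.h2 = neuron_h2
        self.o1 = neuron_o1

    def feedforward(self, x1, x2):
        # The inputs for o1 are the outputs from h1 and h2
        out_h1 = self.h1.feedforward(x1, x2)
        out_h2 = self.h2.feedforward(x1, x2)
        return self.o1.feedforward(out_h1, out_h2)

network = OurNeuralNetwork(Neuron(1, 2, -1), Neuron(0.5, 1, 0), Neuron(2, 0, 1))
print(network.feedforward(2, 3))  # 0.9524917424084265
```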
||||
# Exercise 3 Log loss |
||||
|
||||
The goal of this exercise is to implement the Log loss function. As mentioned last week, this function is used in classification as a **loss function**. It means that the better the classifier is, the smaller the loss function is. In W2D1, you implemented the gradient descent on the MSE loss to update the weights of the linear regression. Similarly, the minimization of the Log loss leads to finding optimal weights. |
||||
|
||||
Log loss: - 1/n * Sum[(y_true*log(y_pred) + (1-y_true)*log(1-y_pred))] |
||||
|
||||
1. Create a function `log_loss_custom` and compute the loss for the data below: |
||||
|
||||
``` |
||||
y_true = np.array([0,1,1,0,1]) |
||||
y_pred = np.array([0.1,0.8,0.6, 0.5, 0.3]) |
||||
``` |
||||
Check that `log_loss` from `sklearn.metrics` returns the same result |
||||
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is: **0.5472899351247816**. |
||||
|
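A solution sketch:

```
import numpy as np
from sklearn.metrics import log_loss

def log_loss_custom(y_true, y_pred):
    # - 1/n * sum(y_true*log(y_pred) + (1-y_true)*log(1-y_pred))
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0.1, 0.8, 0.6, 0.5, 0.3])
print(log_loss_custom(y_true, y_pred))  # 0.5472899351247816
print(log_loss(y_true, y_pred))         # same value
```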
||||
|
||||
# Exercice 4 Forward propagation |
||||
The goal of this exercise is to compute the log loss on the output of the forward propagation. The data used is the tiny data set below. |
||||
|
||||
|
||||
| name | math | chemistry | exam_success | |
||||
|:-------|-------:|------------:|---------------:| |
||||
| Bob | 12 | 15 | 1 | |
||||
| Eli | 10 | 9 | 0 | |
||||
| Tom | 18 | 18 | 1 | |
||||
| Ryan | 13 | 14 | 1 | |
||||
|
||||
|
||||
The goal of the network is to predict the success at the exam given the math and chemistry grades. The inputs are `math` and `chemistry` and the target is `exam_success`. |
||||
|
||||
1. Compute and return the output of the neural network for each of the students. Here are the weights and biases of the neural network: |
||||
|
||||
``` |
||||
neuron_h1 = Neuron(0.05, 0.001, 0) |
||||
neuron_h2 = Neuron(0.02, 0.003, 0) |
||||
neuron_o1 = Neuron(2,0,0) |
||||
``` |
||||
2. Compute the logloss for the data given the output of the neural network with the 4 students. |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is: |
||||
``` |
||||
Bob: 0.7855253278357536 |
||||
Eli: 0.7771516558846259 |
||||
Tom: 0.8067873659804015 |
||||
Ryan: 0.7892343955586032 |
||||
``` |
||||
2. This question is validated if the logloss for the 4 students is **0.5485133607757963**. |
||||
|
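A sketch reusing `Neuron`, `OurNeuralNetwork` and `log_loss_custom` from the previous exercises:

```
import numpy as np

network = OurNeuralNetwork(Neuron(0.05, 0.001, 0),
                           Neuron(0.02, 0.003, 0),
                           Neuron(2, 0, 0))

grades = {"Bob": (12, 15), "Eli": (10, 9), "Tom": (18, 18), "Ryan": (13, 14)}
y_true = np.array([1, 0, 1, 1])  # exam_success

y_pred = np.array([network.feedforward(math, chemistry)
                   for math, chemistry in grades.values()])
for name, pred in zip(grades, y_pred):
    print(f"{name}: {pred}")

print(log_loss_custom(y_true, y_pred))  # 0.5485133607757963
```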
||||
|
||||
# Exercise 5 Regression |
||||
|
||||
The goal of this exercise is to learn to adapt the output layer to regression. |
||||
As a reminder, one of the reasons the sigmoid is used in classification is that it squashes the output between 0 and 1, which is the expected output range for a probability (W2D2: Logistic regression). However, the output of a regression is not a probability. |
||||
|
||||
In order to perform a regression using a neural network, the activation function of the neuron on the output layer has to be modified to the **identity function**. In mathematics, the identity function is **f(x) = x**; in other words, it returns its input as is. The three steps become: |
||||
|
||||
|
||||
1. Each input is multiplied by a weight |
||||
- x1 -> x1 * w1 |
||||
- x2 -> x2 * w2 |
||||
2. The weighted inputs are added together with a bias b |
||||
- (x1 * w1) + (x2 * w2) + b |
||||
3. The sum is passed through an activation function |
||||
- y = f((x1 * w1) + (x2 * w2) + b) |
||||
- The activation function is **the identity** |
||||
- y = (x1 * w1) + (x2 * w2) + b |
||||
|
||||
All the other neurons' activation functions **don't change**. |
||||
|
||||
1. Adapt the Neuron class implemented in exercise 1. It now takes a boolean parameter `regression`. When its value is `True`, `feedforward` should use the identity function as the activation function instead of the sigmoid. |
||||
|
||||
|
||||
``` |
||||
class Neuron: |
||||
def __init__(self, weight1, weight2, bias, regression): |
||||
self.weights_1 = weight1 |
||||
self.weights_2 = weight2 |
||||
self.bias = bias |
||||
#TODO |
||||
|
||||
def feedforward(self, x1, x2): |
||||
#TODO |
||||
return y |
||||
|
||||
``` |
||||
|
||||
- Compute the output for: |
||||
|
||||
``` |
||||
neuron = Neuron(0,1,4, True) |
||||
neuron.feedforward(2,3) |
||||
``` |
||||
|
||||
|
||||
2. Now, the goal of the network is to predict the physics grade at the exam given the math and chemistry grades. The inputs are `math` and `chemistry` and the target is `physics`. |
||||
|
||||
| name | math | chemistry | physics | |
||||
|:-------|-------:|------------:|---------------:| |
||||
| Bob | 12 | 15 | 16 | |
||||
| Eli | 10 | 9 | 10 | |
||||
| Tom | 18 | 18 | 19 | |
||||
| Ryan | 13 | 14 | 16 | |
||||
|
||||
|
||||
Compute and return the output of the neural network for each of the students. Here are the weights and biases of the neural network: |
||||
|
||||
``` |
||||
#replace regression by the right value |
||||
neuron_h1 = Neuron(0.05, 0.001, 0, regression) |
||||
neuron_h2 = Neuron(0.002, 0.003, 0, regression) |
||||
neuron_o1 = Neuron(2,7,10, regression) |
||||
``` |
||||
3. Compute the MSE for the 4 students. |
||||
|
||||
## Correction |
||||
|
||||
1. This question is validated if the output is **7**. |
||||
|
||||
2. This question is validated if the outputs are: |
||||
|
||||
``` |
||||
Bob: 14.918863163724454 |
||||
Eli: 14.83137890625537 |
||||
Tom: 15.086662606964074 |
||||
Ryan: 14.939270885974128 |
||||
``` |
||||
|
||||
3. This question is validated if the MSE is **10.237608699909138** |
||||
|
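A sketch of the adapted class, assuming the `sigmoid` helper from exercise 1:

```
class Neuron:
    def __init__(self, weight1, weight2, bias, regression):
        self.weights_1 = weight1
        self.weights_2 = weight2
        self.bias = bias
        self.regression = regression

    def feedforward(self, x1, x2):
        z = x1 * self.weights_1 + x2 * self.weights_2 + self.bias
        # identity activation for regression, sigmoid otherwise
        return z if self.regression else sigmoid(z)

neuron = Neuron(0, 1, 4, True)
print(neuron.feedforward(2, 3))  # 7
```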