# Importing all the libraries usefull for this notebook
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns


# 1 dimensional array
x_1 = np.array([1, 2, 3])
print(x_1, type(x_1))

[1 2 3] <class 'numpy.ndarray'>


print('x_1.shape', x_1.shape)  # 3 stands for the number of elements in one and only dimension
print('x_1.size', x_1.size)  # 3 represents the total number of elements
print('x_1.ndim', x_1.ndim)  # 1 represents the number of dimensions

x_1.shape (3,)
x_1.size 3
x_1.ndim 1


# 2 dimensional array
x_2 = np.array(
    [[1, 2, 3],
    [4, 5, 6]]
)
print(x_2, type(x_2))

[[1 2 3]
 [4 5 6]] <class 'numpy.ndarray'>


print('x_2.shape', x_2.shape)  # 2 stands for the number of rows, 3 for the number of columns
print('x_2.size', x_2.size)  # 6 represents the total number of elements
print('x_2.ndim', x_2.ndim)  # 2 represents the number of dimensions

x_2.shape (2, 3)
x_2.size 6
x_2.ndim 2


zero_array = np.zeros((3, 4))  # create an array of size 3x4 filled with zeros
print(zero_array)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# calculate the exponential value of each element in the array
# with the base e (mathematical constant)
exp_x_2 = np.exp(x_2)
print(exp_x_2)

[[  2.71828183   7.3890561   20.08553692]
 [ 54.59815003 148.4131591  403.42879349]]


# calculate the sum of elements in the array across columns (axis 1)
sum_axis_0 = np.sum(x_2, axis=0)
print('Sum across rows = ', sum_axis_0)

sum_axis_1 = np.sum(x_2, axis=1)
print('Sum across columns = ', sum_axis_1)

Sum across rows =  [5 7 9]
Sum across columns =  [ 6 15]


print('x_1 * x_1 =', x_1 * x_1)  # element-wise multiplication
print('x_2@x_1 = ', x_2@x_1)  # matrix multiplication, equivalent to .matmull()

x_1 * x_1 = [1 4 9]
x_2@x_1 =  [14 32]


# Create a dataframe from np.array
df_from_array = pd.DataFrame(x_2)
df_from_array.head()  # visualize the first n rows (default n=5)


df_from_array.columns = ['one', 'two', 'three']  # set column names
df_from_array.head()


# Create a dataframe from dictionary
df_from_array = pd.DataFrame(
    {
        'one': [1, 4],
        'two': [2, 5],
        'three': [3, 6]
    }
)
df_from_array.head()


df_from_array['four'] = [4, 7]  # add a new column

# create a column where new value is equal to
# the sum of the value in column one and column three:
df_from_array['sum_one_and_three'] = df_from_array.loc[:, 'one'] + df_from_array.loc[:, 'three']

# create a column where new value is log10 transformed
df_from_array['log10_one'] = np.log10(df_from_array.loc[:, 'one'])

df_from_array.head()


# define the core path
core_path = '/data/gardeux/Course_GG'


f_in = open(
    os.path.join(core_path, 'iris_dataset.csv'),  # path to file
    mode='r'  # mode for opening the file (see function details for more options)
)


print(f_in.readline())  # outputs the first line of the file
print(f_in.readline())  # outputs the second line of the file
print(f_in.readline())  # outputs the second line of the file

sepal_length,sepal_width,petal_length,petal_width,variety

5.1,3.5,1.4,.2,Setosa

4.9,3,1.4,.2,Setosa


f_in.close()  # it's necessary to close the file after you've finished processing it


f_in = open(
    os.path.join(core_path, 'iris_dataset.csv'),  # path to file
    mode='r'  # mode for opening the file (see function details for more options)
)
i = 0
for line in list(f_in.readlines()):  # iterate over each line
    # replace \n with an empty string and
    # split the line based on the separator
    print(line.replace('\n', '').split(','))
    i += 1
    if i == 3:  # to print only the first three lines
        break
f_in.close()  # close the file

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'variety']
['5.1', '3.5', '1.4', '.2', 'Setosa']
['4.9', '3', '1.4', '.2', 'Setosa']


f_in = open(
    os.path.join(core_path, 'iris_dataset.csv'),  # path to file
    mode='r'  # mode for reading the file (see function details for more options)
)
f_out = open(
    os.path.join(core_path, 'setosa_dataset_write.csv'),  # path to file
    mode='w'  # mode for writing the file (see function details for more options)
)
for i, line in enumerate(list(f_in.readlines())):  # iterate over each line in the input file will all data
    # replace \n with an empty string and
    # split the line based on the separator
    values_in_line = line.replace('\n', '').split(',')
    if i == 0:
        f_out.write(line)  # write the header to the output file
    if values_in_line[-1] == 'Setosa':  # if the last element in the list is Setosa
        f_out.write(line)  # write this line to the output file
f_in.close()  # close the input file
f_out.close()  # close the output file


# We use the pd.read_csv() function from Pandas to read files
iris_df = pd.read_csv(
    os.path.join(core_path, 'iris_dataset.csv'),  # specify path to file
    sep=','  # specify separator (default is comma)
)


iris_df.head()


# Subset only setosa species from the dataframe
setosa_df = iris_df[iris_df['variety'] == 'Setosa']


# We use the my_df.to_csv() function from Pandas to save dataframe to .csv
setosa_df.to_csv(
    os.path.join(core_path, 'setosa_dataset_pandas.csv'),  # specify path to file
    sep=',',  # specify separator (default is comma)
    index=False,  # indication not to write index
    header=True  # indication to write header (column names)
)


iris_df.head()


iris_df.describe()


# write your answer here


fig, ax = plt.subplots(1, 1, figsize=(5, 5))  # initialize a figure
ax.scatter(
    iris_df['petal_length'],  # plot petal length on the x axis 
    iris_df['petal_width'],  # plot petal width on the y axis
    color='lightgray'
)
ax.set_xlabel('Petal length', size=12)  # annotate x axis, specify the text size
ax.set_ylabel('Petal width', size=12)  # annotate y axis
plt.title('Petal length vs width', size=12)  # add a title to the plot and specify its size
plt.show()


correlation, corr_pvalue = scipy.stats.pearsonr(
    iris_df['petal_length'],
    iris_df['petal_width']
)
if corr_pvalue < 0.05:
    print(
        'Correlation =', correlation,
        'with the respective pvalue =', corr_pvalue)

Correlation = 0.9628654314027961 with the respective pvalue = 4.6750039073275495e-86


sns.pairplot(iris_df)

<seaborn.axisgrid.PairGrid at 0x1537645ada30>


# provide answers and the code in this cell


# Initialize a figure with two subplots sidy by side:
# fig, (ax1, ax2) = plt.subplots(1, 2)   # or as follows:
fig, ax = plt.subplots(
    1,  # number of rows with plots
    2,  # number of columns with plots
    figsize=(10, 5),  # full figure size, in our case each plot will occupy half of x axis
    sharex=True,  # set x axis to be the same for two plots
    sharey=True,  # set y axis to be the same for two plots
)
# in the first subplot, visualise the distribution of sepal lengths
ax[0].hist(
    iris_df['sepal_length'],
    bins=20,
    color='lightgray',
)
ax[0].set_xlabel('Sepal length', size=12)  # annotate x axis, specify the text size
ax[0].set_ylabel('Frequency', size=12)  # annotate y axis, specify the text size
ax[0].set_title('Sepal length distribution', size=12)  # add a title to the plot and specify its size

# in the second subplot, visualise the distribution of petal lengths
ax[1].hist(
    iris_df['petal_length'],
    bins=20,
    color='lightblue',
)
ax[1].set_xlabel('Petal length', size=12)
ax[1].set_title('Petal length distribution', size=12)
plt.show()


fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.hist(
    [iris_df['sepal_length'], iris_df['petal_length']],  # two datasets in one list
    bins=20,
    density=True,  # plot density value instead of frequencies
    # specify colors for the respective datasets
    # set dataset labels
)

# Indicate with vertical lines mean Sepal and Petal lengths respectively:
ax.axvline(
    np.mean(iris_df['sepal_length']),  # calculate the mean with NumPy
    color='black',
    label='Mean sepal length'
)
ax.axvline(
    np.mean(iris_df['petal_length']),  # calculate the mean with NumPy
    color='blue',
    label='Mean petal length'
)
# add x label
# add y label
# add title
# add legend

plt.show()


# provide the code in this cell


iris_df.groupby('variety').mean()


for iris_variety, variety_df in iris_df.groupby('variety'):
    print('-' * 20)
    print('Iris variety is', iris_variety)
    print(variety_df.head())
    # You can process the subsetted data in any way you want
    # in this loop

--------------------
Iris variety is Setosa
   sepal_length  sepal_width  petal_length  petal_width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa
--------------------
Iris variety is Versicolor
    sepal_length  sepal_width  petal_length  petal_width     variety
50           7.0          3.2           4.7          1.4  Versicolor
51           6.4          3.2           4.5          1.5  Versicolor
52           6.9          3.1           4.9          1.5  Versicolor
53           5.5          2.3           4.0          1.3  Versicolor
54           6.5          2.8           4.6          1.5  Versicolor
--------------------
Iris variety is Virginica
     sepal_length  sepal_width  petal_length  petal_width    variety
100           6.3          3.3           6.0          2.5  Virginica
101           5.8          2.7           5.1          1.9  Virginica
102           7.1          3.0           5.9          2.1  Virginica
103           6.3          2.9           5.6          1.8  Virginica
104           6.5          3.0           5.8          2.2  Virginica

	sepal_length	sepal_width	petal_length	petal_width
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.057333	3.758000	1.199333
std	0.828066	0.435866	1.765298	0.762238
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

	sepal_length	sepal_width	petal_length	petal_width
variety
Setosa	5.006	3.428	1.462	0.246
Versicolor	5.936	2.770	4.260	1.326
Virginica	6.588	2.974	5.552	2.026

Overview of the important Python packages and modules¶

1. On new data types (NumPy array and Pandas dataframe)¶

1.1. NumPy arrays¶

1.2. Pandas dataframes¶

2. Reading files¶

2.1. Optional: Reading files line by line and writing to files¶

2.2. Reading and writing to files with Pandas¶

3. Analyzing and visualizing the data¶

	one	two	three	four	sum_one_and_three	log10_one
0	1	2	3	4	4	0.00000
1	4	5	6	7	10	0.60206