import numpy as np
import pandas as pd

# Create a Series. np.nan marks the fourth entry as a missing value.
# numpy is imported here because this snippet uses np.nan before any other
# numpy import appears in the file.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
import pandas as pd
import numpy as np
# Build a DataFrame from a mapping of column name -> column values
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)
df = pd.DataFrame(data=data)
print(df)
# Use the `loc` method to select rows and columns by label.
print(df.loc[0, 'Name'])  # Select the 'Name' of the first row
# Use the `iloc` method to select rows and columns by integer position.
print(df.iloc[0, 1])  # Select the element in the first row and second column
import pandas as pd
import numpy as np

# Use `dropna()` and `fillna()` to handle missing values.
# Each column below contains exactly one np.nan entry.
df = pd.DataFrame({'A': [1, np.nan, 3], 'B': [4, 5, np.nan]})
# Drop rows with missing values (returns a new DataFrame; df is unchanged)
df_dropped = df.dropna()
print(df_dropped)
# Fill missing values with a specific value (here 0)
df_filled = df.fillna(0)
print(df_filled)
# The `drop_duplicates()` method can be used to remove duplicate rows from a
# DataFrame. The second and third rows below are identical.
df = pd.DataFrame({'A': [1, 2, 2], 'B': [3, 4, 4]})
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)
# The `get_dummies()` method can be used for one-hot encoding.
# Each distinct value of 'Color' becomes its own indicator column.
df = pd.DataFrame({'Color': ['Red', 'Blue', 'Green']})
df_encoded = pd.get_dummies(df)
print(df_encoded)
# Use scikit-learn for scaling numerical variables.
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
scaler = MinMaxScaler()
# The scaled values are wrapped back into a DataFrame so the original column
# names are kept.
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print(df_scaled)
# Use the `describe()` method to get summary statistics of numerical columns
# in a DataFrame.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.describe())
# Use matplotlib or seaborn for data visualization.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# Draw the DataFrame as a bar chart, then display the figure.
df.plot(kind='bar')
plt.show()
# Derive a new column 'C' as the element-wise sum of columns 'A' and 'B'
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}).assign(
    C=lambda frame: frame['A'] + frame['B']
)
print(df)
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]})
# Downcast column 'A' to the smallest integer dtype that can hold its values.
df['A'] = pd.to_numeric(df['A'], downcast='integer')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line.
df.info()
# Keep only the rows without missing values, then renumber them from 0
df = pd.DataFrame({'A': [1, np.nan, 3], 'B': [4, 5, np.nan]})
complete_rows = df.dropna()
result = complete_rows.reset_index(drop=True)
print(result)
# Pandas is an indispensable tool for machine learning preprocessing. Its rich
# set of data structures, indexing methods, and data manipulation functions
# makes data cleaning, transformation, and exploration efficient and
# straightforward. By mastering the fundamental concepts, usage methods, common
# practices, and best practices of Pandas, machine learning practitioners can
# preprocess their data more effectively, leading to better-performing machine
# learning models.