Exploring The Dataset

3 minute read

Exploring the Dataset

1. List and NumPy

1.1 Dimension Control

# Creating arrays
import numpy as np  # the later NumPy snippets all refer to the `np` alias

vector = np.array([10, 20, 30])  # 1-D array built from a flat list
matrix = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])  # 2-D array from a list of rows
matrix_shape = matrix.shape  # (3, 3): (number of rows, number of columns)

## Row/column counts of a nested list: convert it to an array and unpack its shape.
## Unpacking into two names requires `data` to be 2-D (here: users x movies).
n_users, n_items = np.array(data).shape
# Data types: .dtype reports the element type of the array
world_alcohol_dtype = world_alcohol.dtype
# Indexing arrays: [row, column] -> the element at row index 1, column index 4
uruguay_other_1986 = world_alcohol[1,4]
# Slicing a whole column: every row, column index 2
countries = world_alcohol[:,2]
# Slicing a column range: every row, columns 0 and 1 (the stop index 2 is excluded)
first_two_columns = world_alcohol[:,0:2]
# Slicing both dimensions: rows 0-19, columns 1 and 2
first_twenty_regions = world_alcohol[0:20,1:3]

## Fixing format before fitting in a classifier
# Reshape from the flat [x, x, x] form to the column form [[x], [x], [x]];
# per the original note, the classifier needs the extra dimension.
dates = np.reshape(dates,(len(dates), 1)) # converting to matrix of n X 1

1.2 Element Control

## Visit the elements of a list one by one (simple, but slow for large data)
for tweet in public_tweets:
    print(tweet)

## First three elements of a sequence: slice with [:3] before looping
for x in top_items[:3]:
            print("        %s" % x)

## str.split(separator) breaks a string into a list of substrings.
sample = "john,plastic,joe"
split_list = sample.split(sep=",")
print(split_list)  # ['john', 'plastic', 'joe']

# The separator can be any string, e.g. a newline to get one entry per line.
string_two = (
    "How much wood\n"
    "can a woodchuck chuck\n"
    "if a woodchuck\n"
    "could chuck wood?"
)
split_string_two = string_two.split(sep="\n")
print(split_string_two)
# ['How much wood', 'can a woodchuck chuck', 'if a woodchuck', 'could chuck wood?']

# Pull one field out of each row: column 0 is split on '/' and the first piece
# is stored as an int (presumably a date column - TODO confirm the format).
for row in csvFileReader:
    dates.append(int(row[0].split('/')[0]))

## Replacing Special Characters
# Read the whole story into memory. The `with` block closes the file handle
# as soon as the read finishes, even if reading raises.
with open("story.txt", 'r') as f:
    story_string = f.read()
# Characters to strip out of the text.
clean_chars = [",", ".", "'", ";", "\n"]
def clean_text(text_string, special_characters):
    """Return text_string lowercased, with every special character removed.

    Args:
        text_string: The text to clean.
        special_characters: Iterable of substrings to delete from the text.

    Returns:
        The cleaned, lowercased string.
    """
    result = text_string
    # Removal runs on the original casing; lowercasing happens last.
    for unwanted in special_characters:
        result = result.replace(unwanted, "")
    return result.lower()
cleaned_story = clean_text(story_string, clean_chars)

## The In Statement ##
# `value in some_list` is True when the list contains an element equal to value.
animals = ["cat", "dog", "rabbit", "horse", "giant_horrible_monster"]
cat_found = "cat" in animals  # True

1.3 Dictionary

## The In Statement and Dictionaries ##
# On a dictionary, `in` tests the keys, not the values.
planet_numbers = {"mercury": 1, "venus": 2, "earth": 3, "mars": 4}
jupiter_found = "jupiter" in planet_numbers # False
earth_found = "earth" in planet_numbers # True

## Counting with Dictionaries/Key count
pantry = [
    "apple", "orange", "grape", "apple", "orange",
    "apple", "tomato", "potato", "grape",
]
pantry_counts = {}
for item in pantry:
    # .get() supplies 0 the first time a key is seen, so one statement covers
    # both the "new key" and the "existing key" cases.
    pantry_counts[item] = pantry_counts.get(item, 0) + 1

## Counting with a filter condition
top_male_names = []
male_name_counts = {}
for row in legislators:
    # Only rows whose column 3 is 'M' and whose column 7 exceeds 1940 are
    # counted; the value in column 1 is the dictionary key.
    if row[3] == 'M' and row[7] > 1940:
        male_name_counts[row[1]] = male_name_counts.get(row[1], 0) + 1

## Counting simple matches in the data set with re.search()
import re
# Compile once, outside the loop; [Rr] accepts either capitalisation.
of_reddit_pattern = re.compile("of [Rr]eddit")
of_reddit_count = 0
for row in posts:
    # search() returns a match object (truthy) or None (falsy).
    if of_reddit_pattern.search(row[0]):
        of_reddit_count += 1

1.4 Sort

## np.argsort returns the indices that would sort an array in ascending order.
x = np.array([2,4,1])
np.argsort(x)   #array([2, 0, 1])  -> x[[2, 0, 1]] is [1, 2, 4]
np.argsort(-x)  #array([1, 0, 2])  -> negating the values yields descending order
## np.arange(start, stop, step) returns evenly spaced values; stop is excluded.
np.arange(3) #array([0, 1, 2])
np.arange(3,7) #array([3, 4, 5, 6])
np.arange(3,7,2) #array([3, 5])

2. List and Pandas

2.1 Dimension Control

# Exploring the DataFrame
print(food_info.head(3))  # first 3 rows
dimensions = food_info.shape  # (number of rows, number of columns)
print(dimensions)
num_rows = dimensions[0]
print(num_rows)
num_cols = dimensions[1]
print(num_cols)
# Data types of each column
print(food_info.dtypes)
# Selecting a row by label: with the default 0-based integer index, label 99
# is the hundredth row.
hundredth_row = food_info.loc[99]
# Rows 3, 4, 5 and 6: unlike Python slices, .loc slices include the end label.
print(food_info.loc[3:6])
# Rows 2, 5, and 10: non-contiguous rows must be passed as ONE list.
# .loc[2, 5, 10] would be read as a multi-axis key and raise
# "Too many indexers" on a two-dimensional DataFrame.
two_five_ten = food_info.loc[[2, 5, 10]]
# First and last row by position: .iloc takes 0-based integer positions.
first_row = 0
last_row = fandango.shape[0] - 1
first_last = fandango.iloc[[first_row, last_row]]
# Selecting individual columns: a single column name returns a Series
ndb_col = food_info["NDB_No"]
# Selecting multiple columns by name: a list of names returns a DataFrame
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
# Apply() Logic Over Columns: by default the function receives one column at a time
double_df = float_df.apply(lambda x: x*2)
print(double_df.head(1))
# Apply() Over Dataframe Rows: axis=1 passes each row to the function instead,
# so this computes the mean of the two selected columns for every row
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_means = rt_mt_user.apply(lambda x: np.mean(x), axis=1)
print(rt_mt_means[0:5])

2.2 Element/Dictionary Control

# Data Manipulation with pandas
import pandas
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()  # column labels as a plain Python list
print(col_names)
print(food_info.head(3))
# Arithmetic between two columns is element-wise: row i of the result is
# row i of "Water_(g)" times row i of "Energ_Kcal".
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy[0:5])

# Normalizing Columns
# Divide the column by its maximum so that (for a non-zero maximum) the
# largest value becomes 1. The maximum is computed once and reused.
max_protein = food_info["Protein_(g)"].max()
normalized_protein = food_info["Protein_(g)"] / max_protein

## Creating a New Column ##
# Assigning a Series to a new column name adds that column to the DataFrame.
food_info["Normalized_Protein"] = normalized_protein

# Counting per-category totals with pandas
import pandas as pd
all_ages = pd.read_csv("all-ages.csv")
print(all_ages.head(5)) # print the first 5 rows
# Unique values in the Major_category column.
print(all_ages['Major_category'].unique())

aa_cat_counts = dict()
rg_cat_counts = dict()
def calculate_major_cat_totals(df):
    """Sum the "Total" column separately for each major category.

    Args:
        df: DataFrame with "Major_category" and "Total" columns.

    Returns:
        dict mapping each unique "Major_category" value to the sum of "Total"
        over the rows in that category, in order of first appearance.
    """
    category_column = df["Major_category"]
    return {
        # Boolean mask keeps only the rows of the current category.
        category: df[category_column == category]["Total"].sum()
        for category in category_column.unique()
    }

aa_cat_counts = calculate_major_cat_totals(all_ages)

2.3 Indexing/Sorting

## Custom Indexes
# Import the Series object from pandas
from pandas import Series
film_names = series_film.values
rt_scores = series_rt.values
# Build a Series whose index labels are the film names
series_custom = Series(rt_scores , index=film_names)
# A list of labels selects several entries at once
series_custom[['Minions (2015)', 'Leviathan (2014)']]
# Reindexing: reorder the Series to follow the labels in sorted order
original_index = series_custom.index.tolist()
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
# Sorting: sort_index() orders by label, sort_values() orders by the data;
# both results are assigned to new names here
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
# Alignment: arithmetic between two Series pairs up values by index label
# (here the FILM column), not by position
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users)/2

## Sorting a DataFrame by a Column
# inplace=True reorders food_info itself; ascending=False puts the largest values first
food_info.sort_values("Norm_Nutr_Index", inplace=True, ascending=False)

Share on

Twitter Facebook Google+ LinkedIn

Phu Pham

Exploring the Dataset

1. List and NumPy

1.1 Dimension Control

1.2 Element Control

1.3 Dictionary

1.4 Sort

2. List and Pandas

2.1 Dimension Control

2.2 Element/Dictionary Control

2.3 Indexing/Sorting

Share on