Extra Topics¶
Base Python¶
Try / Except - Robustness¶
Errors and warnings are very common while developing code, and an important part of the learning process. In some cases, they can also be useful in designing an algorithm. For example, suppose we have a stream of user-entered data that is supposed to contain the user's age in years. You might expect to get a few errors or nonsense entries.
user_ages=["34", "27", "54", "19", "giraffe", "15", "83", "61", "43", "91", "sixteen"]
It would be useful to convert these values to a numeric type to get the average age of our users, but we want to build something that can set non-numeric values aside. We can attempt to convert to numeric and give Python instructions for handling errors with a `try`-`except` statement:
ages = []
problems = []
for age in user_ages:
    try:
        a = int(age)
        ages.append(a)
    except:
        problems.append(age)
print(ages)
print(problems)
[34, 27, 54, 19, 15, 83, 61, 43, 91] ['giraffe', 'sixteen']
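A bare `except` catches every kind of error, which can hide bugs you didn't anticipate. A slightly more careful sketch (assuming the only failures we expect are failed string-to-integer conversions) names the specific exception instead:
ages = []
problems = []
for age in user_ages:
    try:
        ages.append(int(age))
    except ValueError: #only handle values that can't be converted to int
        problems.append(age)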
User-defined Functions¶
While Python (and its available packages) provides a wide variety of functions, sometimes it's useful to create your own. Python's syntax for defining a function is as follows:
def <function_name>(<arguments>):
    <code depending on arguments>
    return <value>
The `mean` function below returns the mean of a list of numbers. (Base Python does not include a function for the mean.)
def mean(number_list):
    s = sum(number_list)
    n = len(number_list)
    m = s/n
    return m
numbers=list(range(1, 51))
print(mean(numbers))
25.5
Concatenating DataFrames¶
Pandas gives us a few ways to combine multiple DataFrames into one. Suppose we have student records split across two small DataFrames:
import pandas as pd
import numpy as np
a0 = pd.DataFrame({"StudentID" : [1,2],
"GPA_change" : np.random.normal(0,1,2)}) #np.random.normal(0,1,2) pulls 2 random values from a Normal(0,1) distribution
a0
| | StudentID | GPA_change |
|---|---|---|
| 0 | 1 | 0.466944 |
| 1 | 2 | -2.255585 |
a1 = pd.DataFrame({"StudentID" : [3,4],
"GPA_change" : np.random.normal(0,1,2),
"Semester" : ["Spring", "Fall"]})
a1
| | StudentID | GPA_change | Semester |
|---|---|---|---|
| 0 | 3 | 0.585135 | Spring |
| 1 | 4 | 1.628179 | Fall |
With the datasets above it seems clear that these DataFrames would be best combined by stacking them on top of each other, appending one to the other as additional rows or observations. Pandas' `pd.concat` lets us concatenate a list of DataFrames into a single DataFrame.
pd.concat([a0,a1],axis=0,ignore_index=True)
| | StudentID | GPA_change | Semester |
|---|---|---|---|
| 0 | 1 | 0.466944 | NaN |
| 1 | 2 | -2.255585 | NaN |
| 2 | 3 | 0.585135 | Spring |
| 3 | 4 | 1.628179 | Fall |
- `axis=0` indicates that we want to add the DataFrames together row-wise. What happens if we change to `axis=1`? (See the quick sketch after this list.)
- Pandas thinks about dimensions as rows and columns, in that order. `axis=0` refers to rows, whereas `axis=1` refers to columns.
- `ignore_index=True` resets the DataFrame index to start at 0 and run to 3. Otherwise our row index would be 0 1 0 1, from the indices of the original two DataFrames.
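For a quick look at the `axis=1` behavior asked about above, here is a minimal sketch: the same two DataFrames are glued together side by side, with rows matched up by their index rather than stacked.
pd.concat([a0, a1], axis=1) #column-wise concatenation: aligns a0 and a1 on the row index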
Merging¶
When joining column-wise, we usually can't just concatenate our DataFrames together; instead, we use key variables to make sure the same observations end up in the same row.
b0 = pd.DataFrame({"name" : ["Marcos", "Crystal"],
"year" : [1993,1996]})
b1 = pd.DataFrame({"name" : ["Crystal", "Marcos"],
"proj_num" : [6,3]})
pd.concat([b0,b1], axis=1)
| | name | year | name | proj_num |
|---|---|---|---|---|
| 0 | Marcos | 1993 | Crystal | 6 |
| 1 | Crystal | 1996 | Marcos | 3 |
We'll use the `pd.merge` function to merge datasets on key column variables.
pd.merge(b0,b1)
| | name | year | proj_num |
|---|---|---|---|
| 0 | Marcos | 1993 | 3 |
| 1 | Crystal | 1996 | 6 |
`pd.merge` automatically uses all column names that appear in both datasets as keys. We can also specify key variables:
#pd.merge(b0,b1, on = "name")
pd.merge(b0,b1, left_on = "name", right_on = "name")
| | name | year | proj_num |
|---|---|---|---|
| 0 | Marcos | 1993 | 3 |
| 1 | Crystal | 1996 | 6 |
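`left_on` and `right_on` become more useful when the key columns have different names in the two DataFrames. Here is a small hypothetical sketch: `b2` and its `student` column are made up for illustration and don't appear elsewhere in this lesson.
b2 = b1.rename(columns={"name" : "student"}) #hypothetical copy of b1 whose key column has a different name
pd.merge(b0, b2, left_on="name", right_on="student")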
Pandas also includes a DataFrame method version of `merge`:
b0.merge(b1)
| | name | year | proj_num |
|---|---|---|---|
| 0 | Marcos | 1993 | 3 |
| 1 | Crystal | 1996 | 6 |
Note that there is also a `join` method that focuses on joining using the Pandas indices for the objects in question. It can be useful, but `merge` is usually more versatile.
b0.join(b1.set_index("name"), on="name")
| | name | year | proj_num |
|---|---|---|---|
| 0 | Marcos | 1993 | 3 |
| 1 | Crystal | 1996 | 6 |
In the examples above, the two DataFrames share the same "name" key values. However, when the values don't completely match, we can use `how` to choose which values get kept and which values get dropped.
c0 = pd.DataFrame({"name" : ["Marcos","Crystal","Devin","Lilly"],
"year" : [1993,1996,1985,2001]})
c1 = pd.DataFrame({"name" : ["Marcos","Crystal","Devin","Tamera"],
"project_num" : [6,3,9,8]})
pd.merge(c0,c1, how="inner") #default merge type - keeps only observations that appear in both DataFrames
| | name | year | project_num |
|---|---|---|---|
| 0 | Marcos | 1993 | 6 |
| 1 | Crystal | 1996 | 3 |
| 2 | Devin | 1985 | 9 |
pd.merge(c0,c1, how="left") #The "left" join keeps only the observations in the first named DataFrame
| | name | year | project_num |
|---|---|---|---|
| 0 | Marcos | 1993 | 6.0 |
| 1 | Crystal | 1996 | 3.0 |
| 2 | Devin | 1985 | 9.0 |
| 3 | Lilly | 2001 | NaN |
pd.merge(c0,c1, how="right") #The "right" join keeps only the observations in the second named DataFrame
| | name | year | project_num |
|---|---|---|---|
| 0 | Marcos | 1993.0 | 6 |
| 1 | Crystal | 1996.0 | 3 |
| 2 | Devin | 1985.0 | 9 |
| 3 | Tamera | NaN | 8 |
pd.merge(c0,c1, how="outer") #The "full" or "outer" join keeps all observations
| | name | year | project_num |
|---|---|---|---|
| 0 | Marcos | 1993.0 | 6.0 |
| 1 | Crystal | 1996.0 | 3.0 |
| 2 | Devin | 1985.0 | 9.0 |
| 3 | Lilly | 2001.0 | NaN |
| 4 | Tamera | NaN | 8.0 |
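If you want to check which DataFrame each row came from, `pd.merge` also accepts `indicator=True`, which adds a `_merge` column labeling each row as `left_only`, `right_only`, or `both`. A quick sketch (not used elsewhere in this lesson):
pd.merge(c0, c1, how="outer", indicator=True) #the extra _merge column shows where each row came from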
The "cross" option joins every row of the first DataFrame with every row of the second (a Cartesian product): every possible pair of names appears.
pd.merge(c0,c1, how="cross").head(10)
| | name_x | year | name_y | project_num |
|---|---|---|---|---|
| 0 | Marcos | 1993 | Marcos | 6 |
| 1 | Marcos | 1993 | Crystal | 3 |
| 2 | Marcos | 1993 | Devin | 9 |
| 3 | Marcos | 1993 | Tamera | 8 |
| 4 | Crystal | 1996 | Marcos | 6 |
| 5 | Crystal | 1996 | Crystal | 3 |
| 6 | Crystal | 1996 | Devin | 9 |
| 7 | Crystal | 1996 | Tamera | 8 |
| 8 | Devin | 1985 | Marcos | 6 |
| 9 | Devin | 1985 | Crystal | 3 |
Reshaping Data¶
Reshaping a DataFrame can have lots of benefits across data cleanup, analysis, and communication. Here are three different ways to structure the same data.
raw = pd.DataFrame({"City" : ["Raleigh","Durham","Chapel Hill"],
"2000" : np.random.normal(0,1,3).round(2),
"2001" : np.random.normal(0,1,3).round(2),
"2002" : np.random.normal(0,1,3).round(2),
"2003" : np.random.normal(0,1,3).round(2)})
raw
| | City | 2000 | 2001 | 2002 | 2003 |
|---|---|---|---|---|---|
| 0 | Raleigh | -0.29 | 0.04 | 0.57 | -0.22 |
| 1 | Durham | 0.85 | 2.31 | 0.88 | -0.01 |
| 2 | Chapel Hill | 0.70 | 0.02 | -0.58 | 0.03 |
longer = raw.melt(id_vars = ["City"], value_vars = ["2000","2001","2002","2003"],
var_name = "Year")
longer
| | City | Year | value |
|---|---|---|---|
| 0 | Raleigh | 2000 | -0.29 |
| 1 | Durham | 2000 | 0.85 |
| 2 | Chapel Hill | 2000 | 0.70 |
| 3 | Raleigh | 2001 | 0.04 |
| 4 | Durham | 2001 | 2.31 |
| 5 | Chapel Hill | 2001 | 0.02 |
| 6 | Raleigh | 2002 | 0.57 |
| 7 | Durham | 2002 | 0.88 |
| 8 | Chapel Hill | 2002 | -0.58 |
| 9 | Raleigh | 2003 | -0.22 |
| 10 | Durham | 2003 | -0.01 |
| 11 | Chapel Hill | 2003 | 0.03 |
wide_again = longer.pivot(index = "Year", columns = "City", values = "value")
#reset_index moves Year out of the index and back into the DataFrame proper; rename_axis removes the leftover axis name - cleanup that is not always necessary
wide_again = wide_again.reset_index(col_level=1).rename_axis(columns={"City":None})
wide_again
| | Year | Chapel Hill | Durham | Raleigh |
|---|---|---|---|---|
| 0 | 2000 | 0.70 | 0.85 | -0.29 |
| 1 | 2001 | 0.02 | 2.31 | 0.04 |
| 2 | 2002 | -0.58 | 0.88 | 0.57 |
| 3 | 2003 | 0.03 | -0.01 | -0.22 |
"Big Data" and iteration in pandas¶
Reminder: [Download CountyHealthData_2014-2015.csv]
Pandas can read csvs in smaller chunks to help deal with files that are too large to be read into RAM.
In the code below, setting `chunksize` and `iterator=True` generates a flow of 1000-row chunks out of the main dataset. This isn't really necessary for our 6,109-row dataset, but might be critical when working with a 61-million-row dataset.
#Create an empty list for storing chunks.
chunk_list = []
#Read in 1000 rows at a time and store only NC rows as separate chunks in chunk_list.
for chunk in pd.read_csv("CountyHealthData_2014-2015.csv", chunksize=1000, iterator=True):
    nc_rows = chunk[chunk["State"]=="NC"]
    chunk_list.append(nc_rows)
#Combine NC chunks into single data frame and view top rows.
nc_df = pd.concat(chunk_list, ignore_index=True)
nc_df.head(3)
| | State | Region | Division | County | FIPS | GEOID | SMS Region | Year | Premature death | Poor or fair health | ... | Drug poisoning deaths | Uninsured adults | Uninsured children | Health care costs | Could not see doctor due to cost | Other primary care providers | Median household income | Children eligible for free lunch | Homicide rate | Inadequate social support |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NC | South | South Atlantic | Alamance County | 37001 | 37001 | Region 20 | 1/1/2014 | 7123.0 | 0.192 | ... | 10.48 | 0.259 | 0.073 | 8640.0 | 0.167 | 46.0 | 41394 | 0.444 | 4.94 | 0.202 |
| 1 | NC | South | South Atlantic | Alamance County | 37001 | 37001 | Region 20 | 1/1/2015 | 7291.0 | 0.192 | ... | 12.38 | 0.249 | 0.088 | 9050.0 | 0.167 | 56.0 | 43001 | 0.455 | 4.60 | NaN |
| 2 | NC | South | South Atlantic | Alexander County | 37003 | 37003 | Region 20 | 1/1/2014 | 7974.0 | 0.178 | ... | 22.74 | 0.240 | 0.077 | 9316.0 | 0.205 | 30.0 | 39655 | 0.417 | 6.27 | 0.273 |
3 rows × 64 columns
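Chunked reading is also useful for computations that never need the full dataset in memory. The sketch below is one possible approach, assuming we only want a running count of rows per state; it accumulates `value_counts` results chunk by chunk.
#Tally rows per state without ever holding more than one chunk in memory.
state_counts = pd.Series(dtype="float64")
for chunk in pd.read_csv("CountyHealthData_2014-2015.csv", chunksize=1000):
    state_counts = state_counts.add(chunk["State"].value_counts(), fill_value=0)
print(state_counts.sort_values(ascending=False).head())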
Alternative: The csv package¶
Python also comes with a built-in package for reading Comma Separated Values (csv) files. This can sometimes be easier to work with if you don't need the extra functionality of `pandas` or would prefer base Python objects to work with!
import csv
This package provides two major ways to read csv files:
- `csv.reader`: reads the csv into a list of lists, where each row is represented by a list within a master list object.
- `csv.DictReader`: reads the csv into a list of dicts, where each row is a dictionary with keys derived from the first row of the dataset.
The syntax for each command is similar:
list_of_lists = []
with open("CountyHealthData_2014-2015.csv","r") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        list_of_lists.append(row)

list_of_dicts = []
with open("CountyHealthData_2014-2015.csv","r") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        list_of_dicts.append(row)
Notice that each process reads the csv row by row - this can easily be adapted with an `if` condition to filter out specific rows from a dataset that might be too large to open all at once.
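For example, a minimal sketch that keeps only the North Carolina rows (assuming the same State column used earlier) might look like this:
nc_rows = []
with open("CountyHealthData_2014-2015.csv", "r") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row["State"] == "NC": #keep a row only when its State field is NC
            nc_rows.append(row)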
Let's take a look at the differences between the output from each of these processes. Note that the indices are offset by one: `csv.reader` keeps the header row as `list_of_lists[0]`, while `csv.DictReader` consumes the header to build its keys, so `list_of_dicts[0]` is already the first data row.
print(list_of_lists[1])
['AK', 'West', 'Pacific', 'Aleutians West Census Area', '2016', '02016', 'Insuff Data', '1/1/2014', '', '0.122', '2.1', '2.1', '', '0.267', '0.3', '7.002', '0.234', '0.896', '0.266', '', '290.7', '21.1', '0.355', '91', '50', '99', '', '', '', '', '0.466', '0.091', '0.087', '', '0.289', '', '322.06', '', '', '0.03', '0.221', '0.272', '0', '5547', '0.078', '1', '0.067', '', '181', '', '', '0.17', '0.075', '', '', '0.374', '0.25', '3791', '0.185', '216', '69192', '0.127', '', '0.287']
print(list_of_dicts[1])
{'State': 'AK', 'Region': 'West', 'Division': 'Pacific', 'County': 'Aleutians West Census Area', 'FIPS': '2016', 'GEOID': '02016', 'SMS Region': 'Insuff Data', 'Year': '1/1/2015', 'Premature death': '', 'Poor or fair health': '0.122', 'Poor physical health days': '2.1', 'Poor mental health days': '2.1', 'Low birthweight': '0.04', 'Adult smoking': '0.267', 'Adult obesity': '0.329', 'Food environment index': '6.6', 'Physical inactivity': '0.22', 'Access to exercise opportunities': '0.896', 'Excessive drinking': '0.266', 'Alcohol-impaired driving deaths': '', 'Sexually transmitted infections': '288.4', 'Teen births': '21.6', 'Uninsured': '0.293', 'Primary care physicians': '36', 'Dentists': '73', 'Mental health providers': '163', 'Preventable hospital stays': '', 'Diabetic screening': '', 'Mammography screening': '', 'High school graduation': '', 'Some college': '0.474', 'Unemployment': '0.088', 'Children in poverty': '0.076', 'Income inequality': '3.907', 'Children in single-parent households': '0.289', 'Social associations': '9.014', 'Violent crime': '317.35', 'Injury deaths': '47.2', 'Air pollution - particulate matter': '', 'Drinking water violations': '0.026', 'Severe housing problems': '0.207', 'Driving alone to work': '0.347', 'Long commute - driving alone': '0', '2011 population estimate': '5511', 'Population that is not proficient in English': '0.08', 'Population living in a rural area': '1', 'Diabetes': '0.065', 'HIV prevalence rate': '', 'Premature age-adjusted mortality': '173.7', 'Infant mortality': '', 'Child mortality': '', 'Food insecurity': '0.173', 'Limited access to healthy foods': '0.075', 'Motor vehicle crash deaths': '', 'Drug poisoning deaths': '', 'Uninsured adults': '0.314', 'Uninsured children': '0.176', 'Health care costs': '4837', 'Could not see doctor due to cost': '0.185', 'Other primary care providers': '254', 'Median household income': '74088', 'Children eligible for free lunch': '0.133', 'Homicide rate': '', 'Inadequate social support': ''}
Read more about the `csv` package here: https://docs.python.org/3/library/csv.html
Learn more¶
- `pandas` provides a quick introduction here
- Python Data Science Handbook provides more detail and integration with other software.
- A full list of attributes and methods available for DataFrames is available here.