Python: Session 2¶

University Libraries at the University of North Carolina at Chapel Hill

Today we will introduce:

  • Pseudocode
  • Exception handling
  • User-defined functions
  • Working with Tabular Data in Pandas
  • Data visualization
  • Other helpful packages
  • How to continue learning

Pseudocode and Comments¶

Pseudocode¶

As you get started coding in Python, there will be many, many tasks and steps you aren't familiar with! As you learn new functions and approaches, you'll get better and better at searching for help online and reviewing documentation. Learning to write and use pseudocode where appropriate can help organize your plan for any individual script.

Pseudocode is essentially a first draft of your code, written in English for human consumption, though with the tools of your programming language in mind. For example, let's say we have a list of words, and we want to find only the words with 3 or more vowels. Our pseudocode might look something like this:

1. Create a list of words.
2. Create a list of vowels.
3. Create an output list to store the words with 3 or more vowels.
4. Use a loop to iterate through our list of words.
    4a. Use a counter to keep track of how many vowels are found in a word.
    4b. Use a loop to iterate through the letters of each word.
        * If the letter exists in the list of vowels, add 1 to the counter.
    4c. If the counter finds 3 or more vowels, add the word to our output list.
    

This process can divide a complicated task into more digestible parts. You may not know how to complete the different steps yet, but you'll often have better luck finding help online for smaller tasks like these than for your overall goal or project.

Comments¶

Recall that Python ignores anything following a # as a comment. Comments are a vital part of your code, as they leave notes about how or why you're doing something. As you gain experience, you'll use comments in different ways.

Comments can also provide a link between pseudocode and real code. Once you've written your pseudocode, use comments to put the major steps into your code file itself. Then fill in the gaps with actual code as you figure it out.

Here's our pseudocode for the vowel counting task, entered as comments:

In [9]:
#1. Create a list of words.

#2. Create a list of vowels.

#3. Create an output list to store the words with 3 or more vowels.

#4. Use a loop to iterate through our list of words.

    #4a. Use a counter to keep track of how many vowels are found in a word.
    
    #4b. Use a loop to iterate through the letters of each word.
        
        #If the letter exists in the list of vowels, add 1 to the counter.
            
    #4c. If the counter is greater than or equal to 3, add the word and number of vowels to our output list.

And here's how it looks once we've added our code:

In [40]:
#1. Create a list of words.
my_words=["statement", "toy", "cars", "shoes", "ear", "busy", 
              "magnificent", "brainy", "healthy", "narrow", "join", 
              "decay", "dashing", "river", "gather", "stop", "satisfying", 
              "holistic", "reply", "steady", "event", "house", "amused", 
              "soak", "increase"]

#2. Create a list of vowels.
vowels=["a", "e", "i", "o", "u", "y"]

#3. Create an output list to store the words with 3 or more vowels.
output=[]

#4. Use a loop to iterate through our list of words.
for word in my_words:
    
    #4a. Use a counter to keep track of how many vowels are found in a word.
    count = 0
    
    #4b. Use a loop to iterate through the letters of each word.
    for char in word:
        
        #If the letter exists in the list of vowels, add 1 to the counter.
        if char in vowels:
            count = count + 1
            
    #4c. If the counter finds 3 or more vowels, add the word and number of vowels
    #to our output list.
    if count >= 3:
        output.append([word, count])

Exception Handling: Try / Except¶

Errors and warnings are very common while developing code, and they're an important part of the learning process. In some cases, they can also be useful in designing an algorithm. For example, suppose we have a stream of user-entered data that is supposed to contain each user's age in years. You might expect to get a few errors or nonsense entries.

In [62]:
user_ages=["34", "27", "54", "19", "giraffe", "15", "83", "61", "43", "91", "sixteen"]

It would be useful to convert these values to a numeric type so we can get the average age of our users, but we want to build something that sets non-numeric values aside. We can attempt the conversion and give Python instructions for handling errors with a try-except statement:

In [63]:
ages = []
problems = []

for age in user_ages:
    try:
        a = int(age)
        ages.append(a)
    except ValueError: # int() raises a ValueError for non-numeric strings
        problems.append(age)
        
print(ages)
print(problems)
[34, 27, 54, 19, 15, 83, 61, 43, 91]
['giraffe', 'sixteen']
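
With the problem entries set aside, a quick follow-up (a sketch reusing the ages list built above) computes the average age:

avg = sum(ages) / len(ages) # total of the valid ages divided by their count
print(round(avg, 2)) # 47.44 for the ages above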

User-defined Functions¶

While Python (and its available packages) provide a wide variety of functions, sometimes it's useful to create your own. Python's syntax for defining a function is as follows:

def <function_name> ( <arguments> ):
    <code depending on arguments>
    return <value>
        

The mean function below returns the mean of a list of numbers. (Python has no built-in function for the mean, though the standard library's statistics module provides one.)

In [80]:
def mean(number_list):
    s = sum(number_list)
    n = len(number_list)
    m = s/n
    return m

numbers = [1,23,89,5,67,3,1]
print(mean(numbers))
27.0

Using Other Packages¶

So far, we've learned how to work with Python's base data structures like lists and dictionaries. However, much of the real-world data we encounter comes in the form of spreadsheets and tables - structures that include rows and columns. To work easily with these data types, we will need third-party packages that extend Python's functionality. Pandas is one of those packages.

Whenever you need to install a package, you need to use the Miniforge Prompt or a terminal window, NOT Python itself. On Windows, open the Miniforge Prompt from the Miniforge3 folder in the Start Menu; on Mac, open a terminal from Applications > Utilities > Terminal.

Installing packages known to conda can be done with the conda install <package name> command in your Miniforge Prompt window. Otherwise, you may need to use a different package manager, e.g. pip install <package name>.

More information about managing packages in Python is available here.

Working With Tabular Data in Pandas¶

We will be working with a synthetic dataset created for practicing in Python. It doesn't include any real information; instead it is made up of fake circulation data about imaginary books, libraries, and towns.

Download the csv file library_data.csv. I've stored my copy in the same folder as this Jupyter Notebook. Remember that Jupyter Notebooks automatically set your working directory to the folder where the .ipynb is saved. You'll have to save the document at least once to set your directory, but once there you can use relative paths.

pd.read_csv reads the tabular data from a Comma Separated Values (csv) file into a DataFrame object.

In [3]:
import pandas as pd

df = pd.read_csv("library_data.csv")

# Having trouble saving your file to the right location? Try uncommenting and running the line of code below.
#df = pd.read_csv("https://github.com/UNC-Libraries-data/Python/raw/main/Session2/library_data.csv")

Exploring a Data Frame¶

Attributes¶

A good first step in understanding our DataFrame is to examine some of its basic attributes. Attributes contain values that help us understand and use the dataframe.

Here we use the .shape attribute to determine how many rows and columns (in that order) are available. .columns provides the column names for the DataFrame.

In [11]:
df.shape
Out[11]:
(5914, 7)
In [15]:
df.columns
Out[15]:
Index(['id', 'library', 'town', 'town pop', 'title', 'genre', 'borrow_date'], dtype='object')

Methods¶

Much of the functionality for working with dataframes comes in the form of methods. Methods are specialized functions that only work for a certain type of object - in this case, dataframes.

We can look at the first 5 rows in the dataset directly with the .head() method. Alternatively, we can get a random sample of rows using .sample(). Note that we supply the parameter n to specify how many rows we want to sample.

In [17]:
df.head()
Out[17]:
id library town town pop title genre borrow_date
0 491 Heritage Heights Library Hilltop Springs 75000 Secrets of the Old Manor Mystery 1/1/2020
1 3553 Meadowbrook Commons Library Meadowbrook 45000 The Enchanted Forest Fantasy 1/1/2020
2 3384 Riverside Reading Room Riverdale 150000 The Dragon's Crown Fantasy 1/2/2020
3 2267 Heritage Heights Library Hilltop Springs 75000 Letters to Forever Romance 1/2/2020
4 4298 Lakeside Reading Center Lake Haven 15000 Mathematics Made Simple Non-Fiction 1/2/2020
In [18]:
df.sample(n=5)
Out[18]:
id library town town pop title genre borrow_date
4531 1332 Heritage Heights Library Hilltop Springs 75000 Android's Promise Science Fiction 11/21/2023
3634 1943 Riverside Reading Room Riverdale 150000 The Bookshop Romance Romance 3/5/2023
4805 2481 Heritage Heights Library Hilltop Springs 75000 Echoes of Yesterday Literary Fiction 2/4/2024
2516 168 Meadowbrook Commons Library Meadowbrook 45000 The Vanishing at Midnight Mystery 3/8/2022
2767 2024 Pine Valley Library Pine Valley 25000 Summer Hearts Romance 5/21/2022

A full list of attributes and methods for DataFrames is available in the documentation.

Indexing¶

We'll often want to select certain rows or columns from a large dataframe. As with elements in a list, this can be accomplished using indexing. There are some limitations, however. For example, we can use numbers in square brackets to select certain rows, but doing so always returns all the columns in our dataset:

In [21]:
df[0:3]
Out[21]:
id library town town pop title genre borrow_date
0 491 Heritage Heights Library Hilltop Springs 75000 Secrets of the Old Manor Mystery 1/1/2020
1 3553 Meadowbrook Commons Library Meadowbrook 45000 The Enchanted Forest Fantasy 1/1/2020
2 3384 Riverside Reading Room Riverdale 150000 The Dragon's Crown Fantasy 1/2/2020

We can select rows for specific columns using the column names. If we want to select multiple columns, we must list them in their own nested set of square brackets.

In [23]:
df["title"][5:10]
Out[23]:
5    Secrets of the Old Manor
6          The Dragon's Crown
7                Time Paradox
8       Murder on Pine Street
9           The Perfect Crime
Name: title, dtype: object
In [26]:
df[["title", "genre", "borrow_date"]][20:24]
Out[26]:
title genre borrow_date
20 The Perfect Crime Thriller 1/6/2020
21 Secrets of the Old Manor Mystery 1/6/2020
22 The Silent Witness Thriller 1/7/2020
23 Wizard's First Rule Fantasy 1/7/2020

Typing all those names out gets tiring after a while, though. What if we try to select a column by number instead? As the sketch below shows, that produces an error. This is where the attributes .iloc and .loc become useful.
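
A hypothetical cell reconstructing the attempt (the original cell isn't shown):

df[2] # KeyError: plain brackets treat a single number as a column label, not a position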

.iloc¶

If we use the .iloc attribute before our brackets, pandas accepts two numbers separated by a comma. The first number is for rows and the second for columns. Below, we select the row at position 2 and the column at position 6 (remember that Python counts from 0).

In [34]:
df.iloc[2,6]
Out[34]:
'1/2/2020'

We can also use a colon to select multiple rows or columns at once. Note the examples below.

In [40]:
df.iloc[:,1] # All rows of column 1
Out[40]:
0          Heritage Heights Library
1       Meadowbrook Commons Library
2            Riverside Reading Room
3          Heritage Heights Library
4           Lakeside Reading Center
                   ...             
5909         Riverside Reading Room
5910       Heritage Heights Library
5911       Heritage Heights Library
5912    Meadowbrook Commons Library
5913       Heritage Heights Library
Name: library, Length: 5914, dtype: object
In [36]:
df.iloc[0:3,:] # Rows 0-2 of all columns
Out[36]:
id library town town pop title genre borrow_date
0 491 Heritage Heights Library Hilltop Springs 75000 Secrets of the Old Manor Mystery 1/1/2020
1 3553 Meadowbrook Commons Library Meadowbrook 45000 The Enchanted Forest Fantasy 1/1/2020
2 3384 Riverside Reading Room Riverdale 150000 The Dragon's Crown Fantasy 1/2/2020
In [39]:
df.iloc[120:126,1:4] # Rows 120-125 of columns 1-3
Out[39]:
library town town pop
120 Riverside Reading Room Riverdale 150000
121 Riverside Reading Room Riverdale 150000
122 Riverside Reading Room Riverdale 150000
123 Meadowbrook Commons Library Meadowbrook 45000
124 Riverside Reading Room Riverdale 150000
125 Heritage Heights Library Hilltop Springs 75000
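
.loc¶

The .loc attribute works the same way but selects by label instead of position. A minimal sketch (note that, unlike .iloc, a .loc slice includes its endpoint):

df.loc[0:2, ["library", "town"]] # rows labeled 0 through 2 of two named columns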

Series¶

We can think of a DataFrame as a collection of rows and columns, where each row represents an "observation" and each column contains a specific type of information collected about each observation. In Pandas, our columns are stored as Series objects. A DataFrame, like a dictionary, can be thought of as a named collection of objects, but in this case, the objects are Series.

Series have their own set of attributes and methods, just like DataFrames. One of the most useful methods for categorical variables is .value_counts(), which provides a frequency table.

In [44]:
# How many books have been borrowed at each library?
df.library.value_counts()
Out[44]:
library
Heritage Heights Library       1864
Riverside Reading Room         1853
Meadowbrook Commons Library    1239
Pine Valley Library             649
Lakeside Reading Center         309
Name: count, dtype: int64

Filtering¶

To filter our dataset based on a logical condition (true or false), we will use nested square brackets. Note the example below.

  • The inner statement, df["borrow_date"]=="6/23/2024", selects the borrow_date column and checks whether each value equals "6/23/2024"
  • The outer statement, df[ ... ], uses the resulting Series of True/False values to select rows
  • Combined, these two commands return all of the data in rows where the value of the borrow_date field equals "6/23/2024"
In [48]:
# What books were borrowed on 6/23/2024?
df[df["borrow_date"] == "6/23/2024"]
Out[48]:
id library town town pop title genre borrow_date
5283 1271 Riverside Reading Room Riverdale 150000 The Last Space Colony Science Fiction 6/23/2024
5284 2906 Heritage Heights Library Hilltop Springs 75000 The Victorian Secret Historical Fiction 6/23/2024
5285 4864 Heritage Heights Library Hilltop Springs 75000 The Inventor's Life Biography 6/23/2024
5286 4551 Heritage Heights Library Hilltop Springs 75000 Life of Einstein Biography 6/23/2024
5287 3051 Meadowbrook Commons Library Meadowbrook 45000 Ancient Promises Historical Fiction 6/23/2024

Pandas Training on LinkedIn Learning¶

  • Get free access to LinkedIn Learning through UNC
  • Data Analysis with Python and Pandas
  • Getting Started with Pandas and Advanced Pandas

Data Visualization¶

There are many libraries for data visualization within Python. We'll introduce three of them to show some examples of data visualization in Python while answering questions about our library dataset.

Simple bar chart using Matplotlib¶

What are the top 5 most borrowed books?

In [72]:
# Count how often each book has been borrowed
top5 = df.title.value_counts()

# Get only the top 5
top5 = top5.head()

top5
Out[72]:
title
Secrets of the Old Manor     392
The Bookshop Romance         305
Summer Hearts                283
The Detective's Last Case    252
Mathematics Made Simple      231
Name: count, dtype: int64
In [73]:
# load matplotlib
import matplotlib.pyplot as plt

# Create horizontal bar chart
plt.barh(y = top5.index, width = top5.values) # set the x and y axis
plt.gca().invert_yaxis() # display bars in descending order
plt.xlabel("Times Borrowed") # label the x axis
plt.ylabel("Title") # label the y axis
plt.title("Top 5 Most Borrowed Books") # create a chart title
Out[73]:
Text(0.5, 1.0, 'Top 5 Most Borrowed Books')
[Output: horizontal bar chart titled "Top 5 Most Borrowed Books"]

Faceted bar charts using Seaborn¶

What are the top 5 most borrowed books in each library?

In [42]:
# Group the df by library
top5bylib = df.groupby("library")

# Count the number of times each book is borrowed in each library. This gives us a series.
top5bylib = top5bylib.title.value_counts()

# Now that we have a series, we need to group it by library again and get the top 5 in each group
top5bylib = top5bylib.groupby("library").nlargest(5)

# Turn the series into a dataframe to make it easier to work with in Seaborn
top5bylib = top5bylib.reset_index(level=0, drop=True).to_frame().reset_index()

top5bylib.head(15)
Out[42]:
library title count
0 Heritage Heights Library The Detective's Last Case 102
1 Heritage Heights Library The Silent Witness 80
2 Heritage Heights Library Secrets of the Old Manor 79
3 Heritage Heights Library Murder on Pine Street 76
4 Heritage Heights Library The Missing Manuscript 76
5 Lakeside Reading Center Secrets of the Old Manor 68
6 Lakeside Reading Center The Bookshop Romance 55
7 Lakeside Reading Center Philosophy Today 42
8 Lakeside Reading Center The Last Space Colony 40
9 Lakeside Reading Center Life of Einstein 39
10 Meadowbrook Commons Library The Vanishing at Midnight 82
11 Meadowbrook Commons Library Deadly Deadline 80
12 Meadowbrook Commons Library Secrets of the Old Manor 74
13 Meadowbrook Commons Library The Victorian Secret 73
14 Meadowbrook Commons Library The Silent Witness 69
In [101]:
# load seaborn
import seaborn as sns

# create a "grid" object using the FacetGrid function. 
grid = sns.FacetGrid(data = top5bylib, col = "library", col_wrap = 3, hue = "library", sharey = False, aspect = 1.5)

# specify which chart we want to use on the grid and supply the variables for the x and y axis.
fig = grid.map_dataframe(sns.barplot, y = "title", x = "count")
[Output: faceted horizontal bar charts of the top 5 most borrowed books at each library]

Interactive Line Chart Using Bokeh¶

What are the borrowing trends for Secrets of the Old Manor over time?

In [40]:
# Use a filter to select the title we want to focus on
somtrend = df[df["title"] == "Secrets of the Old Manor"]

# Count the number of times the book is borrowed on each date. This gives us a series.
somtrend = somtrend.borrow_date.value_counts()

# Turn the series into a dataframe
somtrend = somtrend.to_frame().reset_index()

# Give borrow_date a datetime format
somtrend["borrow_date"] = pd.to_datetime(somtrend["borrow_date"])

# Group and sum by month for a smoother line. This gives us a series again.
somtrend = somtrend.groupby(somtrend["borrow_date"].dt.to_period("M"))["count"].sum()

# Turn the series into a dataframe again
somtrend = somtrend.to_frame().reset_index()

# Sort dataframe in chronological order for Bokeh to display it correctly
somtrend = somtrend.sort_values(by="borrow_date")

somtrend.head()
Out[40]:
borrow_date count
0 2020-01 10
1 2020-02 9
2 2020-03 8
3 2020-04 6
4 2020-05 10
In [43]:
# load bokeh modules
from bokeh.plotting import figure, show, output_notebook

# set up bokeh for working in jupyter notebooks
output_notebook()

# set up the size of our plot and format the x axis for dates
p = figure(height = 300, width = 600, x_axis_type = "datetime")

# add a line to our plot
p.line(source = somtrend, x = "borrow_date", y = "count", line_width = 2)

# show the plot
show(p)
[Output: BokehJS loads, followed by an interactive line chart of monthly borrow counts]

Other Helpful Libraries¶

Data Packages

  • NumPy for numerical computation in Python
  • scikit-learn for data analysis and machine learning
  • Polars for high-performance dataframes designed for large-scale data processing
  • DuckDB for creating a SQL database

Other Utilities

  • Beautiful Soup for parsing HTML and XML
  • NLTK for text analysis
  • Pillow for images
  • joblib or multiprocessing for running parallel/concurrent jobs

Numpy¶

Numpy provides the mathematical functionality (e.g. large arrays, linear algebra, random numbers, etc.) behind many popular statistical and machine learning tasks in Python. It is a dependency for many of the other packages we have discussed and will discuss, including pandas. One of the foundational objects in numpy is the array:

In [46]:
import numpy as np
import pandas as pd

a_list = [[1,2],[3,4],[5,6],[7,8]] #list of ROWS
an_array = np.array(a_list, ndmin = 2)

an_array
Out[46]:
array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

We can use numpy to do many numerical tasks, for example creating random values:

In [47]:
np.random.rand(2,2)
Out[47]:
array([[0.37438578, 0.47633254],
       [0.41351988, 0.08877861]])

scikit-learn¶

scikit-learn provides a consolidated interface for machine learning in Python:

  • functions for splitting data into training and testing components
  • cross validation for model tuning
  • supervised and unsupervised modeling
  • model fit assessment and comparison

Read more about using sklearn.

The following example comes from scikit-learn's Linear Regression Example page.

In [49]:
import numpy as np
from sklearn import linear_model, datasets
import matplotlib.pyplot as plt

# Load the diabetes dataset
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Use only one feature
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes_y[:-20]
diabetes_y_test = diabetes_y[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test, color="black")
plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
[Output: scatter plot of the test data with the fitted regression line]

Polars (dataframes for large-scale data processing)¶

The Polars library offers an alternative to Pandas dataframes that often performs much faster and uses less RAM for dataframe operations. Polars is implemented in Rust, while Pandas is built on NumPy, which is generally slower and uses more memory for these workloads. Polars can also efficiently run operations in parallel, adding to its performance advantage.

In [50]:
import sys
import time
import polars as pl
import pandas as pd

# how long does it take Polars to load in a CSV?
start_pl = time.time()
df_pl = pl.read_csv("library_data.csv")
end_pl = time.time()
print(f'Seconds for Polars to load in the library_data CSV: {end_pl-start_pl}')

# how long does it take Pandas to load in a CSV?
start_pd = time.time()
df_pd= pd.read_csv("library_data.csv")
end_pd = time.time()
print(f'Seconds for Pandas to load in the library_data CSV: {end_pd-start_pd}')

# how much faster is Polars?
print(f'Polars is {round((end_pd-start_pd)/(end_pl-start_pl), 2)}x faster')
Seconds for Polars to load in the library_data CSV: 0.016093730926513672
Seconds for Pandas to load in the library_data CSV: 0.021875381469726562
Polars is 1.36x faster
In [51]:
# compare the memory size of the Polars and Pandas dataframes
# (caveat: sys.getsizeof only counts the Python wrapper object; Polars keeps
# its data in Rust-managed memory, so its true footprint is larger than shown)
print(f'The Polars dataframe takes up {sys.getsizeof(df_pl)} bytes.')
print(f'The Pandas dataframe takes up {sys.getsizeof(df_pd)} bytes.')
The Polars dataframe takes up 48 bytes.
The Pandas dataframe takes up 1972339 bytes.

DuckDB (for creating a SQL Database)¶

DuckDB is a great library for setting up a SQL database with Python. It has no external dependencies and is very memory efficient, making it a fast alternative to PostgreSQL, MySQL, or SQLite.

In [57]:
import duckdb

duckdb.read_csv("library_data.csv")
duckdb.sql("SELECT id, library, town, title FROM 'library_data.csv' WHERE town = 'Riverdale' LIMIT 10")
Out[57]:
┌───────┬────────────────────────┬───────────┬──────────────────────────┐
│  id   │        library         │   town    │          title           │
│ int64 │        varchar         │  varchar  │         varchar          │
├───────┼────────────────────────┼───────────┼──────────────────────────┤
│  3384 │ Riverside Reading Room │ Riverdale │ The Dragon's Crown       │
│  1449 │ Riverside Reading Room │ Riverdale │ Time Paradox             │
│   921 │ Riverside Reading Room │ Riverdale │ Murder on Pine Street    │
│   931 │ Riverside Reading Room │ Riverdale │ Murder on Pine Street    │
│  4122 │ Riverside Reading Room │ Riverdale │ Philosophy Today         │
│   886 │ Riverside Reading Room │ Riverdale │ Murder on Pine Street    │
│  2404 │ Riverside Reading Room │ Riverdale │ The Great American Novel │
│  2488 │ Riverside Reading Room │ Riverdale │ Echoes of Yesterday      │
│  5680 │ Riverside Reading Room │ Riverdale │ The Perfect Crime        │
│  5767 │ Riverside Reading Room │ Riverdale │ Night Watch              │
├───────┴────────────────────────┴───────────┴──────────────────────────┤
│ 10 rows                                                     4 columns │
└───────────────────────────────────────────────────────────────────────┘

BeautifulSoup (for parsing HTML or XML data)¶

Python's built-in urllib.request package makes it relatively easy to download the underlying HTML from a web page. Note that the from <package> import <function> notation used here allows you to selectively import only the parts of a package you need.

Be sure to check the terms of service for any website before scraping! We're scraping our own materials here to be safe!

In [58]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Scrape Python 1 materials!
page = urlopen("https://unc-libraries-data.github.io/Python/Intro/Introduction.html")
html = page.read()

# Parse the HTML
soup = BeautifulSoup(html,"html.parser")
[x.text for x in soup.find_all("h2")] # find all second-level headers
Out[58]:
['Why Python?¶',
 'Getting Started¶',
 'Data Types and Variables¶',
 'Flow Control¶',
 'More Data Types¶',
 'Review¶',
 'Pseudocode and Comments¶',
 'User-defined Functions¶',
 'Coming up¶',
 'References and Resources¶']

NLTK (for text analysis)¶

The Natural Language Toolkit (nltk) provides a wide array of tools for processing and analyzing text. This includes operations like splitting text into sentences or words ("tokenization"), tagging them with their part of speech, classification, and more.

In [78]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

text = "This is an example sentence. This sentence is a simple example." 

words = word_tokenize(text) # tokenize the text
fdist = FreqDist(words) # count how often each word occurs

# Show the frequencies
plt.figure()
plt.barh(fdist.keys(), fdist.values())
plt.xticks(range(1,3))
plt.xlabel('Frequency')
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tuesday\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Out[78]:
Text(0.5, 0, 'Frequency')
[Output: horizontal bar chart of word frequencies in the example text]

PIL (Pillow)¶

Pillow is the updated version of the old Python Imaging Library (PIL), which provides fundamental tools for working with images. Pillow can work with many common formats (some of which may require extra packages or other dependencies) to automate a wide variety of image transformations.

Note: While pillow is the name you use to install the package, you import its functions with import PIL.

Both display and imshow can be used to preview the image. Currently, display raises some errors due to the image's modes, so we're using imshow to avoid them. Read more about modes.

In [66]:
from PIL import Image
from urllib.request import urlretrieve
from matplotlib.pyplot import imshow
from IPython.display import display

# download the unc logo
urlretrieve("https://identity.unc.edu/wp-content/uploads/sites/885/2019/01/UNC_logo_webblue-e1517942350314.png",
           "UNC_logo.png")

# open and display the image
UNC = Image.open("UNC_logo.png")
imshow(UNC)
Out[66]:
<matplotlib.image.AxesImage at 0x13ffeb64f80>
[Output: the UNC logo]
In [67]:
# resize the image and make it grayscale
# note: // divides two numbers and rounds the result down to get an integer
UNC_gray = UNC.convert('LA').resize((UNC.width//2,UNC.height//2))
imshow(UNC_gray)
Out[67]:
<matplotlib.image.AxesImage at 0x13ffd3c1e20>
[Output: the UNC logo, grayscale and at half size]

Parallel Processing with joblib¶

As you move into more complicated processes in Python (or apply your code to many objects or files), processing time can become a major factor. Fortunately, most modern laptops have multiple processor cores that can do separate things at the same time, but Python only uses one core by default. If you have a set of loop iterations that don't depend on each other (e.g. processing lots of files one after another), you can split them up between cores to greatly increase speed.

The joblib package provides a straightforward way to split loops up between your computer's cores for faster performance on complicated code. Note that parallelization may not benefit you much, and may even slow you down, for very quick jobs, because setting up and consolidating information from separate cores creates overhead costs.
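
A minimal sketch of the pattern, where slow_square is a made-up stand-in for your own per-item work:

from joblib import Parallel, delayed
import time

def slow_square(x):
    time.sleep(1) # stand-in for an expensive per-item task
    return x * x

# run 8 independent iterations across 4 cores instead of one after another
results = Parallel(n_jobs=4)(delayed(slow_square)(i) for i in range(8))
print(results) # [0, 1, 4, 9, 16, 25, 36, 49]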

How to continue learning¶

  • Exercises and solutions for practicing Python
  • Exercises and solutions for practicing Pandas
  • Practice Python
  • Python Projects - Beginner to Advanced
  • Python Projects You Can Build
  • Automate the Boring Stuff with Python
  • Python Programming for the Humanities
  • Python Data Science Handbook This free ebook emphasizes Numpy, Scipy, Matplotlib, Pandas and other data analysis packages in Python, assuming some familiarity with the basic principles of the language.
    • Whirlwind Tour of Python
  • Are you used to working in R? Check out this Data manipulation R-Python conversion guide and The Struggles I Had When Switching from R to Python