进阶数据操作(Dictionary&DataFrame)

List

数据单独列出来就是List

pop = [30.55, 2.77, 39.21]
countries = ["afghanistan", "albania", "algeria"] 
ind_alb = countries.index("albania")
ind_alb
1
pop[ind_alb]
2.77

list使用起来不是很方便。

Dictionary

两组list的数据对应起来的话就变成Dictionary了

world = {"afghanistan":30.55, "albania":2.77, "algeria":39.21}
world["albania"]
2.77

新增或者删除数值

world["sealand"] = 0.000028
world
{'afghanistan': 30.55, 'albania': 2.77, 'algeria': 39.21, 'sealand': 2.8e-05}

del(world["sealand"])
world
{'afghanistan': 30.55, 'albania': 2.77, 'algeria': 39.21}

多层字典
说白了就是字典里面套字典

# Dictionary of dictionaries
europe = { 'spain': { 'capital':'madrid', 'population':46.77 },
           'france': { 'capital':'paris', 'population':66.03 },
           'germany': { 'capital':'berlin', 'population':80.62 },
           'norway': { 'capital':'oslo', 'population':5.084 } }


# Print out the capital of France
europe["france"]["capital"]

# Create sub-dictionary data
data={"capital":"rome","population":59.83}

# Add data to europe under key 'italy'
europe["italy"]=data

# Print europe
print(europe)
{'france': {'population': 66.03, 'capital': 'paris'}, 'italy': {'population': 59.83, 'capital': 'rome'}, 'germany': {'population': 80.62, 'capital': 'berlin'}, 'norway': {'population': 5.084, 'capital': 'oslo'}, 'spain': {'population': 46.77, 'capital': 'madrid'}}

DataFrame

数据分析里用的最多的其实还是DataFrame(数据框)，操作数据框的话会用到pandas工具包。

把字典转换成数据框
关键语法
import xx as x
pd.DataFrame()

# Pre-defined lists
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# Import pandas as pd
import pandas as pd

# Create dictionary my_dict with three key:value pairs: my_dict
my_dict={"country":names,"drives_right":dr,"cars_per_cap":cpc}

# Build a DataFrame cars from my_dict: cars
cars=pd.DataFrame(my_dict)

# Print cars
print(cars)
   cars_per_cap        country  drives_right
0           809  United States          True
1           731      Australia         False
2           588          Japan         False
3            18          India         False
4           200         Russia          True
5            70        Morocco          True
6            45          Egypt          True

设置行名
cars.index=xxxxx

import pandas as pd

# Build cars DataFrame
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]
cars_dict = { 'country':names, 'drives_right':dr, 'cars_per_cap':cpc }
cars = pd.DataFrame(cars_dict,)
print(cars)

# Definition of row_labels
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']

# Specify row labels of cars
cars.index=row_labels


# Print cars again
print(cars)
   cars_per_cap        country  drives_right
0           809  United States          True
1           731      Australia         False
2           588          Japan         False
3            18          India         False
4           200         Russia          True
5            70        Morocco          True
6            45          Egypt          True
     cars_per_cap        country  drives_right
US            809  United States          True
AUS           731      Australia         False
JPN           588          Japan         False
IN             18          India         False
RU            200         Russia          True
MOR            70        Morocco          True
EG             45          Egypt          True

读取csv文件

cars= pd.read_csv("cars.csv",index_col = 0)

Pandas的简单操作

之后会花篇幅详细学习Pandas

选择列
[]和[[ ]]的区别：[]返回的是Series(输出不带列名表头)，[[ ]]返回的是DataFrame(输出带列名)，并且[[ ]]可以同时选择多列

print(cars["country"])
US     United States
AUS        Australia
JPN            Japan
IN             India
RU            Russia
MOR          Morocco
EG             Egypt
Name: country, dtype: object

print(cars[["country"]])
           country
US   United States
AUS      Australia
JPN          Japan
IN           India
RU          Russia
MOR        Morocco
EG           Egypt
print(cars[["country","drives_right"]])
           country  drives_right
US   United States          True
AUS      Australia         False
JPN          Japan         False
IN           India         False
RU          Russia          True
MOR        Morocco          True
EG           Egypt          True

选择行
可以用切片选取指定行，这个和R很相似(注意Python从0开始计数，且不包含结束位置，所以[0:3]取的是前3行)

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out first 3 observations
print(cars[0:3])

# Print out fourth, fifth and sixth observation
print(cars[3:6])
     cars_per_cap        country  drives_right
US            809  United States          True
AUS           731      Australia         False
JPN           588          Japan         False
     cars_per_cap  country  drives_right
IN             18    India         False
RU            200   Russia          True
MOR            70  Morocco          True
print(cars[0:3])
     cars_per_cap        country  drives_right
US            809  United States          True
AUS           731      Australia         False
JPN           588          Japan         False

<script.py> output:
         cars_per_cap        country  drives_right
    US            809  United States          True
    AUS           731      Australia         False
    JPN           588          Japan         False
         cars_per_cap  country  drives_right
    IN             18    India         False
    RU            200   Russia          True
    MOR            70  Morocco          True

loc and iloc

loc按行/列的标签选取，iloc按整数位置选取。下面的例子用的是loc。

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out drives_right value of Morocco
print(cars.loc["MOR","drives_right"])

# Print sub-DataFrame
print(cars.loc[["RU","MOR"],["country","drives_right"]])

<script.py> output:
    True
         country  drives_right
    RU    Russia          True
    MOR  Morocco          True

and, or, not

看下面这个简单的例子就好,直接用and or

# Define variables
my_kitchen = 18.0
your_kitchen = 14.0

# my_kitchen bigger than 10 and smaller than 18?
print(my_kitchen > 10 and my_kitchen < 18)

# my_kitchen smaller than 14 or bigger than 17?
print(my_kitchen<14 or my_kitchen>17)

# Double my_kitchen smaller than triple your_kitchen?
print(2*my_kitchen < 3*your_kitchen)

<script.py> output:
    False
    True
    True

not也是直接用not

In [1]:
x = 8
y = 9
not(not(x < 3) and not(y > 14 or y > 10))
Out[1]:
False

用numpy来进行类似的操作

np.logical_and
np.logical_or
np.logical_not

# Create arrays
import numpy as np
my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])

# my_house greater than 18.5 or smaller than 10
print(np.logical_or(my_house >18.5,my_house<10))

# Both my_house and your_house smaller than 11
print(np.logical_and(my_house<11,your_house<11))

得到结果

<script.py> output:
    [False  True False  True]
    [False False False  True]

if, elif, else

条件后面用的:分隔

area = 10.0
if(area < 9) :
    print("small")
elif(area < 12) :
    print("medium")
else :
    print("large")

比较一下，R语言里用的是{}

if (test_expression) {
statement1
} else {
statement2
}

Python的简洁易懂的优势就体现出来了。

再举个例子, Python可以没有规矩到条件不需要加()

# Define variables
room = "kit"
area = 14.0

# if-else construct for room
if room == "kit" :
    print("looking around in the kitchen.")
else :
    print("looking around elsewhere.")

# if-else construct for area
if area > 15 :
    print("big place!")
else :
    print("pretty small.")

# if statement for room
if room == "kit" :
    print("looking around in the kitchen.")

# if statement for area
if area>15 :
    print("big place!")

练习题

Remember about np.logical_and(), np.logical_or() and np.logical_not(), the Numpy variants of the and, or and not operators? You can also use them on Pandas Series to do more advanced filtering operations.

Take this example that selects the observations that have a cars_per_cap between 10 and 80. Try out these lines of code step by step to see what's happening.

cpc = cars['cars_per_cap']
between = np.logical_and(cpc > 10, cpc < 80)
medium = cars[between]

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Import numpy, you'll need this
import numpy as np

# Create medium: observations with cars_per_cap between 100 and 500
cpc = cars['cars_per_cap']
between = np.logical_and(cpc > 100, cpc < 500)
medium = cars[between]

# Print medium
print(medium)

output

<script.py> output:
        cars_per_cap country  drives_right
    RU           200  Russia          True

while

语法结构,用:隔开

while condition :
    expression

x = 1
while x < 4 :
    print(x)
    x = x + 1

# Initialize offset
offset = 8

# Code the while loop
while offset != 0:
    print("correcting...")
    offset = offset-1
    print(offset)

output

<script.py> output:
    correcting...
    7
    correcting...
    6
    correcting...
    5
    correcting...
    4
    correcting...
    3
    correcting...
    2
    correcting...
    1
    correcting...
    0

while和if的混搭

# Initialize offset
offset = -6

# Code the while loop
while offset != 0 :
    print("correcting...")
    if offset > 0 :
      offset = offset -1
    else : 
      offset = offset + 1   
    print(offset)

for循环

List里的for循环

# areas list
areas = [11.25, 18.0, 20.0, 10.75, 9.50]

# Code the for loop
for i in areas :
    print(i)

enumerate, 可以同时输出元素的索引和数值

l = ['Alice', 'Bob', 'Charlie']

for name in l:
    print(name)
# Alice
# Bob
# Charlie

for i, name in enumerate(l):
    print(i, name)
# 0 Alice
# 1 Bob
# 2 Charlie

默认是从0开始，可以指定从1或者从任意数字开始。

for i, name in enumerate(l, 1):
    print(i, name)
# 1 Alice
# 2 Bob
# 3 Charlie

自定义输出格式

# areas list
areas = [11.25, 18.0, 20.0, 10.75, 9.50]

# Code the for loop
for index, area in enumerate(areas) :
    print("room " + str(index+1) + ": " + str(area))

output

room 1: 11.25
room 2: 18.0
room 3: 20.0
room 4: 10.75
room 5: 9.5

双层list的for循环也是毫不费力

# house list of lists
house = [["hallway", 11.25], 
         ["kitchen", 18.0], 
         ["living room", 20.0], 
         ["bedroom", 10.75], 
         ["bathroom", 9.50]]
         
# Build a for loop from scratch
for x, y in house:
    print("the " + str(x) + " is " + str(y) + " sqm" )

the hallway is 11.25 sqm
the kitchen is 18.0 sqm
the living room is 20.0 sqm
the bedroom is 10.75 sqm
the bathroom is 9.5 sqm

Dictionary里的for循环

和list不一样的是需要用到items()这个指令

# Definition of dictionary
europe = {'spain':'madrid', 'france':'paris', 'germany':'berlin',
          'norway':'oslo', 'italy':'rome', 'poland':'warsaw', 'austria':'vienna' }
          
# Iterate over europe
for k, v in europe.items() :
    print("the capital of " + k + " is " + v)

Numpy array里的for循环

numpy array可以是1D也可以是2D，1D的话和list一样直接遍历，2D的话要用到np.nditer，它会逐个元素遍历整个数组，相当于把二维数组当成一维来显示。(下面的np_height和np_baseball这里假设已经事先定义好，分别是1D和2D的numpy array。)

# Import numpy as np
import numpy as np  

# For loop over np_height
for x in np_height : 
    print(str(x) + " inches")

# For loop over np_baseball
for val in np.nditer(np_baseball) :
    print(val)

Dataframe里的for循环

iterrows()按照行来分析。

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
cars.head()

# Iterate over rows of cars
for lab, row in cars.iterrows() :
    print(lab)
    print(row)

output

cars.head()

     cars_per_cap        country  drives_right
US            809  United States          True
AUS           731      Australia         False
JPN           588          Japan         False
IN             18          India         False
RU            200         Russia          True

<script.py> output:
    US
    cars_per_cap              809
    country         United States
    drives_right             True
    Name: US, dtype: object
    AUS
    cars_per_cap          731
    country         Australia
    drives_right        False
    Name: AUS, dtype: object
    JPN
    cars_per_cap      588
    country         Japan
    drives_right    False
    Name: JPN, dtype: object
    IN
    cars_per_cap       18
    country         India
    drives_right    False
    Name: IN, dtype: object
    RU
    cars_per_cap       200
    country         Russia
    drives_right      True
    Name: RU, dtype: object
    MOR
    cars_per_cap         70
    country         Morocco
    drives_right       True
    Name: MOR, dtype: object
    EG
    cars_per_cap       45
    country         Egypt
    drives_right     True
    Name: EG, dtype: object

两种方法添加列

1.for循环(iterrows)

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
cars.head()

# Code for loop that adds COUNTRY column
for lab, row in cars.iterrows():
    cars.loc[lab,"COUNTRY"] = str.upper(row["country"])


# Print cars
print(cars)

2.apply

apply会对整列一次性操作，不需要再套for循环。

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
cars["country"]

# Use .apply(str.upper)
cars["COUNTRY"] = cars["country"].apply(str.upper)

print(cars)