numpy、pandas数据处理-CFANZ编程社区

import os
import numpy as np
os.chdir('/Users/p/Desktop/Exercise/')

# Report what share of the words in alice.txt are positive, negative or
# neither, based on two fixed vocabularies.  Matching is exact and
# case-sensitive: a word counts only if it equals a vocabulary entry.
positive_list = ['good', 'nice', 'friendly', 'great', 'clean', 'comfortable', 'amazing', 'enjoyable', 'wonderful',
                 'great']
negative_list = ['poor', 'bad', 'unfriendly', 'horrible', 'dirty', 'uncomfortable']

# Read everything up front so the file is closed before any processing starts.
with open('alice.txt') as f:
    result = ' '.join(f.readlines())

# Strip commas and periods so that "good," and "good." still match "good",
# and turn newlines into spaces.
my_string = result.replace(',', '').replace('.', '').replace('\n', ' ')

# split() without an argument splits on any run of whitespace and never
# produces empty tokens.  Splitting on a single ' ' left one empty string per
# extra space; removing just one of them with list.remove('') miscounted the
# rest as "other" and raised ValueError when there was none to remove.
total_words = my_string.split()

found = {'positive': 0, 'negative': 0, 'other': 0}

# Sets give constant-time membership tests; the two vocabularies are disjoint,
# so every word lands in exactly one bucket.
positive_words = set(positive_list)
negative_words = set(negative_list)
for value in total_words:
    if value in positive_words:
        found['positive'] += 1
    elif value in negative_words:
        found['negative'] += 1
    else:
        found['other'] += 1

# An empty file has no words; dividing by 1 instead of 0 reports 0% for every
# category rather than crashing with ZeroDivisionError.
word_count = len(total_words) or 1
print('positive is:%f%%,negative is:%f%%,other is %f%%' % (
    found['positive'] / word_count * 100,
    found['negative'] / word_count * 100,
    found['other'] / word_count * 100,
))

import  numpy as np
import pandas as pd
from pandas import Series,DataFrame
import os
os.chdir('/Users/p/Desktop/Exercise/')

# Load the drinks data set and report its basic shape.
df = pd.read_csv('Drinks.csv')
print(type(df))
print(df.shape)
# df.shape is (rows, columns).  Row labels start at 0, so a last label of 192
# corresponds to 193 rows.
print('数据集中的变量%d条数据，变量%d个：'%(df.shape[0],df.shape[1]))

# This script treats both NaN and 0 as "missing": count each kind per column
# and print the grand total.
num_nan = df.isna().sum()
num_zero = (df == 0).sum()
print('数据中缺失值（NaN或者0）的数量为：',(num_nan+num_zero).sum())

# numeric_only=True restricts the median to numeric columns.  Without it,
# pandas >= 2.0 raises TypeError as soon as the frame has a string column.
median = df.median(axis=0, numeric_only=True)
print('相应变量的中位数:\n', median)

quant_keys = ["beer_servings", "spirit_servings",  "wine_servings", "total_litres_of_pure_alcohol"]
for k in quant_keys:
    # Replace every missing entry (NaN *or* 0, matching the count above) with
    # the column median.  The median is the one printed above, i.e. computed
    # before replacement with NaN skipped and zeros still included.
    is_missing = df[k].isna() | (df[k] == 0)
    df[k] = df[k].mask(is_missing, median[k])
print(df)

# Average consumption per continent.  sort=False keeps continents in order of
# first appearance.  groupby skips rows whose continent is NaN, so no
# meaningless `nan: nan` entry is produced (an equality test against NaN never
# matches any row).
# NOTE(review): if the CSV encodes North America as the literal "NA",
# read_csv parses it as NaN by default and those rows are left out here;
# confirm against the data file.
continent_groups = df.groupby('continent', sort=False)
beer_ave = continent_groups['beer_servings'].mean().to_dict()
wine_ave = continent_groups['wine_servings'].mean().to_dict()
print('不同大陆(continent)的平均啤酒消耗(beer_servings)是：',beer_ave)
print('不同大陆平均紅酒消耗(wine_servings)是：',wine_ave)