import os
import numpy as np
# Report what share of the words in alice.txt are positive, negative or neither.
os.chdir('/Users/p/Desktop/Exercise/')
with open('alice.txt') as f:
    result = f.readlines()
result = ' '.join(result)

positive_list = ['good', 'nice', 'friendly', 'great', 'clean', 'comfortable', 'amazing', 'enjoyable', 'wonderful',
                 'great']
negative_list = ['poor', 'bad', 'unfriendly', 'horrible', 'dirty', 'uncomfortable']

# Drop commas and full stops and turn newlines into spaces in a single pass.
my_string = result.translate(str.maketrans({',': None, '.': None, '\n': ' '}))

# split() with no argument collapses runs of whitespace, so no empty tokens are
# produced. Splitting on ' ' and removing a single '' left most empty tokens in
# the list (inflating the "other" count and the denominator) and raised
# ValueError when there was no empty token at all.
total_words = my_string.split()

# Matching is exact and case-sensitive: 'Good' or 'good!' is counted as "other".
found = {'positive': 0, 'negative': 0, 'other': 0}
for value in total_words:
    if value in positive_list:
        found['positive'] += 1
    elif value in negative_list:
        found['negative'] += 1
    else:
        found['other'] += 1

# Guard against an empty file: every share is reported as 0 instead of raising
# ZeroDivisionError.
word_count = len(total_words) or 1
print('positive is:%f%%,negative is:%f%%,other is %f%%' % (
    found['positive'] / word_count * 100,
    found['negative'] / word_count * 100,
    found['other'] / word_count * 100,
))
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import os
# Load Drinks.csv, report its size and missing values, impute the quantitative
# columns with their medians, then average beer and wine servings per continent.
os.chdir('/Users/p/Desktop/Exercise/')
df = pd.read_csv('Drinks.csv')
print(type(df))
print(df.shape)
# Row and column counts. The default row index starts at 0, so the label of the
# last row is one less than the number of rows.
print('数据集中的变量%d条数据,变量%d个:'%(df.shape[0],df.shape[1]))

# This script treats both NaN and 0 as "missing".
num_nan = df.isna().sum()  # NaN cells per column
num_zero = (df == 0).sum()  # cells equal to 0 per column
print('数据中缺失值(NaN或者0)的数量为:',(num_nan+num_zero).sum())

# numeric_only=True skips the text columns; without it pandas >= 2.0 raises
# TypeError when asked for the median of a string column.
median = df.median(axis=0, numeric_only=True)
print('相应变量的中位数:\n',median)

quant_keys = ["beer_servings", "spirit_servings", "wine_servings", "total_litres_of_pure_alcohol"]
for k in quant_keys:
    # Replace both NaN and 0 with the column median. Previously only NaN was
    # replaced, although 0 is counted as missing above.
    df[k] = df[k].replace([np.nan, 0], median[k])
print(df)

# Mean beer and wine servings per continent, in order of first appearance.
# dropna=False keeps rows whose continent is missing as their own group;
# comparing with == can never match NaN, so that group used to average to NaN.
# NOTE(review): read_csv parses the literal text 'NA' as missing by default --
# confirm whether Drinks.csv uses 'NA' as a continent code.
continent_means = df.groupby('continent', sort=False, dropna=False)[
    ['beer_servings', 'wine_servings']
].mean()
beer_ave = continent_means['beer_servings'].to_dict()
wine_ave = continent_means['wine_servings'].to_dict()
print('不同大陆(continent)的平均啤酒消耗(beer_servings)是:',beer_ave)
print('不同大陆平均紅酒消耗(wine_servings)是:',wine_ave)