#### pandas_dataframe1 ##########
import numpy as np
import pandas as pd
# Build a small demo frame. Column 'C' is a Series with a string index, so the
# frame takes its row labels (zhang, li, zhou, wang) from it; the other
# columns are length-4 arrays or a scalar that is broadcast to every row.
df = pd.DataFrame({
    'A': np.random.randint(1, 100, 4),
    'B': pd.date_range(start='20200401', periods=4, freq='D'),
    'C': pd.Series(range(4),
                   index=["zhang", "li", "zhou", "wang"],
                   dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df)

# sort_index: axis=0 sorts by row labels, axis=1 sorts by column labels;
# ascending defaults to True and can be set to False.
print(df.sort_index(axis=0, ascending=True))
print(df.sort_index(axis=0, ascending=False))
print(df.sort_index(axis=1, ascending=True))
print(df.sort_index(axis=1, ascending=False))

# `in` on a Series tests the index labels (strings here), so this is False ...
print(3 in df['C'])
# ... whereas testing against .values looks at the data (0.0 .. 3.0): True.
print(3 in df['C'].values)

# Selection: positional row slice, column subset, scalar access, label lists.
print(df[0:2])
print(df.loc[:, ['A', 'C']])
print(df.at['zhang', 'A'])
print(df.loc[['zhang'], ['A']])

# Boolean-mask filtering.
print(df[df.E == 'test'])
print(df[df['A'] > 50])

# Replace every value equal to 3 with 5 (returns a new frame).
print(df.replace(3, 5))

# reindex with an extra column name adds column 'G' filled with NaN.
df1 = df.reindex(columns=list(df.columns) + ['G'])
print(df1)

# fillna(..., inplace=True) returns None, so the original bound df2 to None.
# Take the returned Series instead; df1 itself is left untouched.
df2 = df1['G'].fillna(5)
print(df2)
#### pandas_dataframe2 ##########
from idlelib.replace import replace
import numpy as np
import pandas as pd
from pandas.core.interchange.dataframe_protocol import DataFrame
# 500 x 4 frame of standard-normal samples; print its summary statistics.
data = pd.DataFrame(np.random.randn(500, 4))
print(data.describe())

# Merge reminder: `how` may be 'outer', 'left' or 'right'; when it is not
# given the default is 'inner'.
# pd.merge(df1, df2, on='key', how='outer')

# Half-open integer ranges [low, high) used to draw 8 values per column.
_column_ranges = {'A': (1, 5), 'B': (10, 15), 'C': (20, 30), 'D': (80, 100)}
df4 = pd.DataFrame({
    label: np.random.randint(low, high, 8)
    for label, (low, high) in _column_ranges.items()
})
print(df4)

# Mean per (A, B) pair, with the group keys kept as ordinary columns.
print(df4.groupby(by=['A', 'B'], as_index=False).mean())
# Sum per value of A; the key may be given as a one-element list or a string.
print(df4.groupby(by=['A']).sum())
print(df4.groupby(by='A').sum())
# Excel I/O, for reference: df4.to_excel(...), pd.read_excel(...)

# ---- Data-cleaning example ----
# NOTE(review): the column names used below ('num_ber', 'aa', 'st_num',
# 'Date') assume the layout of ./test_data, which is not visible here.

# Strings that must be treated as missing values (important).
missing_values = ['n/a', 'na', '--']

# First pass: default NaN detection only. The original called pd.read_csv()
# without a path, which raises TypeError.
df = pd.read_csv('./test_data')
print(df)
print(df['num_ber'])
print(df['num_ber'].isnull())

# Second pass: convert every custom marker to NaN while reading.
df = pd.read_csv('./test_data', na_values=missing_values)
print(df)
print(df['aa'])
print(df['aa'].isnull())

# Two alternative treatments of missing data, each returning a new frame.
# fillna(..., inplace=True) returns None and would also overwrite df before
# the mean below is taken, so the non-inplace form is used.
new_df = df.dropna()
new_df2 = df.fillna(1234)
print(new_df.to_string())

# Mean of the column (NaN is skipped), then fill its gaps. Assign the result
# back instead of calling fillna(inplace=True) on a column selection.
print(df['st_num'].mean())
df['st_num'] = df['st_num'].fillna(1234)

# Date formatting: parse each value individually (format='mixed').
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
# Build a DataFrame from a dict by passing the dict to the constructor.
# Sample data stands in for the real dict: the original constructed an empty
# frame, which has no 'age' column to work on.
people = {
    'age': [25, 130, 47, 130],
    'height': [170, 165, 180, 165],
}
df = pd.DataFrame(people)

# Cap implausible ages at 120. A boolean mask replaces the row-by-row loop
# (which iterated over the non-existent attribute `df.dex`).
df.loc[df['age'] > 120, 'age'] = 120

# De-duplication: with no arguments, whole rows are compared.
print(df.duplicated())
print(df.drop_duplicates())

# Correlation analysis; common methods are "pearson" and "spearman".
# Pearson lies in [-1, 1]: closer to 1 means stronger positive correlation,
# closer to -1 means stronger negative correlation.
print(df.corr(method="pearson", min_periods=1))
# Called without arguments (uses the defaults).
print(df.corr())
# Visualization:
# call plot() directly on the DataFrame.
df.plot()
# NOTE(review): showing the figure would need plt.show(); no
# `matplotlib.pyplot as plt` import is visible in this part of the file.
# plt.show()
#### pandas_demo1 ##########
from operator import index
import numpy as np
import pandas as pd
# x = pd.Series([1,3,5,np.nan])
# print(x)
# bb = range(5)
# for i in bb:
# print(i)
# aa = pd.Series(range(5))
# print(aa.values)
# bb = pd.Series(range(5),index=list('abcde'))
# print(bb)
#
# cc = pd.date_range(start='20240101',end='20240131',freq='D')
# print(cc)
# Twelve consecutive days used as the row index.
dates = pd.date_range(start='20240101', end='20240112', freq='D')
# Random frame with one row per date; the row count is taken from `dates`
# so the two cannot drift apart.
dd = pd.DataFrame(
    np.random.randn(len(dates), 4),
    index=dates,
    columns=list('ABCD'),
)
# print(dd)
# print(dd.head())
# print(dd.head(3))
# print(dd.tail(1))
print(dd)
# print(dd['A'].index)
# print(dd['A'].keys)
# print(dd['A'])
# Iterating a Series yields its values (not its index labels).
for i in dd['A']:
    print(i)
# for i in dd['A'].values:
#     print(i)
#### pandas_demo1 ##########
import pandas as pd
# Markers that should be read as NaN.
missing_values = ['n/a', 'na', '--']
data = pd.read_csv("test_data.csv", na_values=missing_values)
print(data)
print("##################################")

# Drop the two columns "Unnamed:0" and "Remark".
del data['Remark']
del data['Unnamed:0']
print(data)
print("##################################")

# Drop every row that contains NaN. The explicit copy makes new_data an
# independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
new_data = data.dropna().copy()
print(new_data)
print("##################################")

# Date formatting. Assign the whole column: `new_data.loc[:, 'Date'] = ...`
# writes into the existing object-dtype column instead of replacing it (the
# original note here reported wrong times with that form, and a warning with
# plain assignment on the un-copied frame).
new_data['Date'] = pd.to_datetime(new_data['Date'], format='mixed')

# Payment: remove '$', turn the decimal comma into '.', trim stray spaces,
# then convert to float. regex=False makes '$' a literal character on every
# pandas version.
new_data['Payment'] = (
    new_data['Payment']
    .str.replace('$', '', regex=False)
    .str.replace(',', '.', regex=False)
    .str.strip()
    .astype(float)
)

# Note: strip the stray '?', '-' and '!' characters in one pass.
new_data['Note'] = new_data['Note'].str.replace(r'[?!-]', '', regex=True)

# Name: split on ',', reverse the parts, join with a space, then lowercase.
new_data['Name'] = (
    new_data['Name']
    .str.split(',')
    .apply(lambda parts: ' '.join(parts[::-1]))
    .str.lower()
)
print(new_data)
#### pandas_serires ##############
import numpy as np
import pandas as pd
# Standard way to build a Series from data with explicit index labels.
demo_datas = pd.Series(range(5), index=list('abcde'))
print(demo_datas)

# Standard way to build a range of dates; freq: H = hour, D = day, M = month.
demo_dates = pd.date_range(start='20240101', end='20240303', freq='D')
print(demo_dates)

# How to get a DataFrame from a Series: x.to_frame() or pd.DataFrame({'x': x}).
# The values must be separate list items; the original wrapped them in one
# string, producing a single-element Series of text.
x = pd.Series([1, 3, 5, np.nan])
# print(x.info)
# print(x.values)
#### test1 ##############
# coding: utf-8
# (Without the leading '#', the line above is an annotation statement that
# evaluates `utf - 8` and raises NameError.)

# Sign check and absolute value.
num = -5
if num > 0:
    print("num为正数")
else:
    print("num为负数")
print(abs(num))

nums = [1, 2, 3, 4, 5]
# print(sorted(nums, reverse=True))
# reversed() returns an iterator; materialize it so the values are printed
# rather than the iterator's repr.
print(list(reversed(nums)))
# help()  # opens the interactive help prompt and blocks the script
#
# print(max(nums))

# Rounding and exponentiation.
a = 10
b = 3
print(round(a / b, 2))
print(pow(a, b))
#
# for i in nums:
#     print(i)
#
# total = 0
#
# while True:
#     num = int(input("Enter a number: "))
#     total = total + num
#     if num == 0:
#         break
#
# print("Total:", total)

import random as rd

# random.randint needs both bounds and includes both of them.
aa = rd.randint(0, 10)
print(aa)
nums = [1, 2, 3, 4, 5]


def test():
    """Return the arithmetic mean of the module-level list ``nums``.

    Raises:
        ZeroDivisionError: if ``nums`` is empty.
    """
    # Use the builtin sum() directly instead of a local accumulator that
    # shadowed it.
    return sum(nums) / len(nums)


print(test())
#### test2 ##############
# coding: utf-8
import numpy as np
# 5 x 5 matrix of uniform random numbers in [0, 1). The original created a
# flat array of 55 values, for which the nested builtin sum below fails
# because the inner sum is already a scalar.
matrix = np.random.rand(5, 5)
print(matrix)
print(np.mean(matrix))
print(np.sum(matrix))
# Builtin sum over a 2-D array adds the rows element-wise (column totals);
# the outer sum then adds those totals.
print(sum(sum(matrix)))
def test(param1):
print(param1)
return True
# test(123)
print(test("aa"))
def test2(param1,param2):
return True
print(test2(1,2))
# Print the odd numbers from 1 to 99 (start 1, stop before 100, step 2).
for i in range(1, 100, 2):
    print(i)
#### test3 ##############
class Rectangole:
    """Axis-aligned rectangle described by its width and height."""

    def __init__(self, width: float, height: float) -> None:
        """Store the two side lengths.

        Args:
            width: length of the horizontal side.
            height: length of the vertical side.
        """
        self.width = width
        self.height = height

    def area(self) -> float:
        """Return ``width * height``."""
        return self.width * self.height
# Create a rectangle with width 5 and height 10 and print its area.
my_rect = Rectangole(5, 10)
print(my_rect.area())
class Animal:
    """Base class holding an animal's name."""

    def __init__(self, name) -> None:
        """Store ``name`` on the instance."""
        self.name = name

    def run(self) -> None:
        """Print ``"<name> is running"``."""
        # An f-string also works when name is not a str, where `+` would
        # raise TypeError.
        print(f"{self.name} is running")
class Dog(Animal):
    """An ``Animal`` that can also bark."""

    def bark(self) -> None:
        """Print ``"<name> is barking"``."""
        # An f-string also works when name is not a str, where `+` would
        # raise TypeError.
        print(f"{self.name} is barking")
# Create a Dog named "Jack"; run() is inherited from Animal, bark() is
# defined on Dog.
d = Dog("Jack")
d.run()
d.bark()
########### test_data.csv ##############
Unnamed:0,Date,Name,Payment,Note,Remark
0,04/04/2021,"doe,john","$100,50",Unhappy!,
1,"april 17th,2021","Doe,Jane",$78.50 ,?Satidfied,
2,2021-8-21,"smith,Adam",$65 ,Neutral,
3,05/02/2021,"Tuck,matt",$120 ,Unhappy-,
4,05/12/2021,"James,Ben",,Neutral0,
5,2021-10-11,,$100 ,Neutral,
6,,,,,