0
点赞
收藏
分享

微信扫一扫

数据清洗之 日期格式数据处理


日期格式数据处理

  • Pandas中使用to_datetime()方法将文本格式转换为日期格式
  • dataframe数据类型如果为datetime64,可以使用dt方法取出年月日等
  • 对于时间差数据,可以使用timedelta函数将其转换为指定时间单位的数值
  • 时间差数据,可以使用dt方法访问其常用属性

import pandas as pd
import numpy as np
import

os.getcwd()

'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据转换'

os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')

df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})

df.head(5)



user_id

auction_id

cat_id

cat1

property

buy_mount

day

0

786295544

41098319944

50014866

50022520

21458:86755362;13023209:3593274;10984217:21985...

2

20140919

1

532110457

17916191097

50011993

28

21458:11399317;1628862:3251296;21475:137325;16...

1

20131011

2

249013725

21896936223

50012461

50014815

21458:30992;1628665:92012;1628665:3233938;1628...

1

20131011

3

917056007

12515996043

50018831

50014815

21458:15841995;21956:3494076;27000458:59723383...

2

20141023

4

444069173

20487688075

50013636

50008168

21458:30992;13658074:3323064;1628665:3233941;1...

1

20141103

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 7 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.6+ MB

df['buy_date'] = pd.to_datetime(df['day'], format='%Y%m%d', errors='coerce')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 8 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
buy_date 29971 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 1.8+ MB

df.head(5)



user_id

auction_id

cat_id

cat1

property

buy_mount

day

buy_date

0

786295544

41098319944

50014866

50022520

21458:86755362;13023209:3593274;10984217:21985...

2

20140919

2014-09-19

1

532110457

17916191097

50011993

28

21458:11399317;1628862:3251296;21475:137325;16...

1

20131011

2013-10-11

2

249013725

21896936223

50012461

50014815

21458:30992;1628665:92012;1628665:3233938;1628...

1

20131011

2013-10-11

3

917056007

12515996043

50018831

50014815

21458:15841995;21956:3494076;27000458:59723383...

2

20141023

2014-10-23

4

444069173

20487688075

50013636

50008168

21458:30992;13658074:3323064;1628665:3233941;1...

1

20141103

2014-11-03

# 使用dt方法提取属性
# df['buy_date'].dt.year # 提取年
# df['buy_date'].dt.month # 提取月
# df['buy_date'].dt.day # 提取天

df['diff_day'] = pd.datetime.now() - df['buy_date']  # 时间差格式

df.head(5)



user_id

auction_id

cat_id

cat1

property

buy_mount

day

buy_date

diff_day

0

786295544

41098319944

50014866

50022520

21458:86755362;13023209:3593274;10984217:21985...

2

20140919

2014-09-19

2034 days 22:32:35.614788

1

532110457

17916191097

50011993

28

21458:11399317;1628862:3251296;21475:137325;16...

1

20131011

2013-10-11

2377 days 22:32:35.614788

2

249013725

21896936223

50012461

50014815

21458:30992;1628665:92012;1628665:3233938;1628...

1

20131011

2013-10-11

2377 days 22:32:35.614788

3

917056007

12515996043

50018831

50014815

21458:15841995;21956:3494076;27000458:59723383...

2

20141023

2014-10-23

2000 days 22:32:35.614788

4

444069173

20487688075

50013636

50008168

21458:30992;13658074:3323064;1628665:3233941;1...

1

20141103

2014-11-03

1989 days 22:32:35.614788

df.dtypes

user_id                object
auction_id int64
cat_id int64
cat1 int64
property object
buy_mount int64
day int64
buy_date datetime64[ns]
diff_day timedelta64[ns]
dtype: object

# 使用dt方法提取属性
# df['diff_day'].dt.days # 提取天数
# df['diff_day'].dt.seconds # 提取秒
# df['diff_day'].dt.microseconds # 提取纳秒

# 将时间差转换为规定的格式
df['时间差'] = df['diff_day']/pd.Timedelta('1 D') # 转换为天数

df['时间差'].head(5)

0    2034.939301
1 2377.939301
2 2377.939301
3 2000.939301
4 1989.939301
Name: 时间差, dtype: float64

df['时间差'] = df['diff_day']/pd.Timedelta('1 H')  # 转换为小时

df['时间差'].head(5)

0    48838.543226
1 57070.543226
2 57070.543226
3 48022.543226
4 47758.543226
Name: 时间差, dtype: float64

df['时间差'] = df['diff_day']/pd.Timedelta('1 M')  # 转换为分钟数

df['时间差'].head(5)

0    2.930313e+06
1 3.424233e+06
2 3.424233e+06
3 2.881353e+06
4 2.865513e+06
Name: 时间差, dtype: float64

# 将科学计数转换为小数
df['时间差'].head(5).round(decimals=3)

0    2930312.594
1 3424232.594
2 3424232.594
3 2881352.594
4 2865512.594
Name: 时间差, dtype: float64

# D: 天  M: 月  Y: 年
# 转换为天数
df['diff_day'].astype('timedelta64[D]').head(5)

0    2034.0
1 2377.0
2 2377.0
3 2000.0
4 1989.0
Name: diff_day, dtype: float64

# 转换为月数
df['diff_day'].astype('timedelta64[M]').head(5)

0    66.0
1 78.0
2 78.0
3 65.0
4 65.0
Name: diff_day, dtype: float64

# 转换为年数
df['diff_day'].astype('timedelta64[Y]').head(5)

0    5.0
1 6.0
2 6.0
3 5.0
4 5.0
Name: diff_day, dtype: float64

# 转换为小时
# 还可转换为分钟、秒等等
df['diff_day'].astype('timedelta64[h]').head(5)

0    48838.0
1 57070.0
2 57070.0
3 48022.0
4 47758.0
Name: diff_day, dtype: float64


举报

相关推荐

0 条评论