1. The conventional approach first:
Open the output file in append mode: write the header on the first write, and from the second write onward pass header=None. A minimal sketch follows.
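Something like this, assuming pandas; make_batch() here is a hypothetical stand-in for whatever produces each chunk of rows:

import numpy as np
import pandas as pd

def make_batch(size, cols):
    # hypothetical helper: one chunk of random feature rows
    return pd.DataFrame(np.random.normal(0, 1, (size, cols)),
                        columns=[f"x{i}" for i in range(cols)])

path = "./demo.csv"
for i in range(10):  # 10 batches of 10000 rows = 100k rows in total
    df = make_batch(10000, 5)
    if i == 0:
        df.to_csv(path, index=False, mode="w")                # first write: with header
    else:
        df.to_csv(path, index=False, mode="a", header=False)  # appends: no header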
2. On reflection, that still wasn't flexible enough: the id column was too rigid. So I use a generator that yields batches of batch_size rows, building each batch in memory and returning it right away to avoid a MemoryError.
This is done in two steps:
Step 1:
Generate the unique id column (100k rows), hashed with sha256:
import csv
import uuid
from hashlib import sha256
import numpy as np
import argparse
import random,string
def gen_demo(rows: int, each_iter: int, features, mode_encrypt: int):
    """yield one batch of data every each_iter rows to avoid a MemoryError"""
    capacity = []
    counter = 0
    for i in range(rows):
        temp = []
        uid = str(uuid.uuid1()).replace("-", "")
        if mode_encrypt:
            encrypt = sha256(uid.encode("utf-8")).hexdigest()
        else:
            # plain uid plus a 10-char random suffix
            encrypt = uid + ''.join(random.sample(string.ascii_letters + string.digits, 10))
        if not features:
            temp.append(encrypt)
        else:
            feature_value = np.around(np.random.normal(0, 1, features), decimals=5).tolist()
            temp.extend([encrypt] + feature_value)
        capacity.append(temp)
        counter += 1
        if counter % each_iter == 0:
            print(f"has generated {counter} rows")
            yield capacity
            capacity = []  # reset the buffer, otherwise earlier batches get written again
    if capacity:  # flush the last partial batch when rows % each_iter != 0
        yield capacity
def foo():
    """example: python generate_id.py -r 100000 -b 10000 -e 1"""
    parse = argparse.ArgumentParser()
    parse.add_argument("-e", "--encrypt", type=int, required=True,
                       help="0 means plain uid, 1 means sha256")
    parse.add_argument("-r", "--row", required=True, type=int,
                       help="number of sample rows")
    parse.add_argument("-f", "--features", required=False, type=int,
                       help="number of feature columns; omit to generate ids only")
    parse.add_argument("-b", "--batch_size", required=True, type=int,
                       help="number of rows yielded per batch")
    # note: type=bool would turn any non-empty string into True, so use a flag instead
    parse.add_argument("-header", "--header", action="store_true",
                       help="write a header row")
    parse.add_argument("-n", "--name", required=False, type=str,
                       help="name of the output csv file")
    args = parse.parse_args()
    features = args.features if args.features else 0
    row = args.row
    each_iter = args.batch_size
    if args.name:
        name = args.name
        print("args name is %s" % name)
    else:
        name = "id_sha256"
    if args.header and features:
        header = ["id"] + [f"x{i}" for i in range(features)]
    else:
        header = None  # TODO: to keep the id csv with an id header, set header=["id"]
    encrypt_method = args.encrypt
    data = gen_demo(row, each_iter, features, encrypt_method)
    with open("./%s.csv" % name, "w", newline="") as f:
        wr = csv.writer(f)
        if header:
            wr.writerow(header)
        for dt in data:
            wr.writerows(dt)

if __name__ == '__main__':
    print("tip:\n", foo.__doc__)
    foo()
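Before moving on, a quick sanity check (a sketch, not part of the original script) that the generated ids really are unique:

import pandas as pd

ids = pd.read_csv("./id_sha256.csv", header=None)
assert ids[0].is_unique, "duplicate ids found"
print(f"{len(ids)} ids, all unique")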
Step 2: generate the feature data and write it to csv:
import pandas as pd
import numpy as np

__author__ = "Chen"
# benchmark: 100k x 1000 columns cost 143.43 s; 100k x 10 columns cost 2.02 s
# number of feature columns
col = 10
# total sample rows; must match the number of ids in id_sha256.csv
totals_row = 100000
# rows yielded and written per batch; 2000, 5000 or 10000 is suggested
batch_size = 10000
# output path for the guest or host data_set
target_path = "./breast_b.csv"
# path of the id csv
id_csv_path = "./id_sha256.csv"
# id_csv_path = "./ids_1sw.csv"  # todo: test with ids range(100000), int64
# whether the generated data carries a label column y
label_switch = True
if batch_size > totals_row:
    raise ValueError("batch_size can't be larger than totals_row")
def yield_id():
    """read the id csv in chunks of batch_size rows and yield each chunk as a list"""
    data_set = pd.read_csv(id_csv_path, chunksize=batch_size, iterator=True, header=None)
    counter = 0
    for it in data_set:
        size = it.shape[0]
        a = it[0].tolist()  # the single column holds the ids
        counter += size
        print(f"has yielded {counter} ids")
        yield a
def concat(with_label):
    """attach random features (and optionally a label y) to each batch of ids"""
    ids = yield_id()
    for id_list in ids:
        n = len(id_list)  # the last chunk may be smaller than batch_size
        df_id = pd.DataFrame(id_list, columns=["id"], dtype=str)  # np.str is removed in NumPy >= 1.24
        value_a = np.around(np.random.normal(0, 1, (n, col)), decimals=5)
        df_feature = pd.DataFrame(value_a, columns=[f"x{i}" for i in range(col)])
        if with_label:
            df_y = pd.DataFrame(np.random.choice(2, n), dtype=np.int64, columns=["y"])
            one_iter_data = pd.concat([df_id, df_y, df_feature], axis=1, ignore_index=False)
        else:
            one_iter_data = pd.concat([df_id, df_feature], axis=1, ignore_index=False)
        yield one_iter_data
def save_data(path, with_label):
    """write each batch to csv: the first batch with a header, the rest appended without

    if with_label is True the output carries a label column y, i.e.
    header = ["id"] + ["y"] + [f"x{i}" for i in range(col)], else just id plus features
    """
    one_batch = concat(with_label)
    for index, df_dt in enumerate(one_batch):
        if index == 0:
            print(df_dt.dtypes, "\n")
            print(f"header of csv:\n{df_dt.columns.values.tolist()}")
            df_dt.to_csv(path, index=False)  # first batch: write with header
        else:
            df_dt.to_csv(path, index=False, mode="a", header=None)  # append without header
if __name__ == '__main__':
    """
    before generating the output csv at $target_path, run generate_id.py first
    ('python generate_id.py -r 100000 -b 10000 -e 1') to produce $totals_row ids,
    then run this script to generate the data output
    """
    import time
    start = time.time()
    idsha256 = pd.read_csv(id_csv_path, header=None)
    id_sha256_rows = idsha256.shape[0]
    if totals_row != id_sha256_rows:
        raise ValueError(f"totals_row is {totals_row}, but id_sha256.csv holds {id_sha256_rows} ids; they must match")
    save_data(target_path, with_label=label_switch)
    print(time.time() - start)
Tested: 100k rows x 10 columns costs about 2 s.
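A quick look at the result afterwards (a sketch, assuming the config above with label_switch=True, so 12 columns in total):

import pandas as pd

out = pd.read_csv("./breast_b.csv")
print(out.shape)   # expect (100000, 12): id + y + x0..x9
print(out.head())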