pandas 生成数据大数据-CFANZ编程社区

# coding=utf-8
import pandas as pd
import numpy as np
import uuid
from hashlib import sha256

# batch_size of each time write rows to id_sha256.csv
batch_size = 200000
# total_samples
total_samples = 10000000
# path_id csv
path_id_csv = "./id_sha256.csv"
# gen numeric,if numeric gen int64 to id_sha256.csv,False gen sha256  object of pandas.
numeric = True
# set header "id"
no_header = True


def foo(band):
    for index, v in enumerate(band):
        a, b = v[0], v[1]
        t = [k for k in range(a, b)]
        yield t


def value_sha(a, b):
    t = []
    if numeric:
        for k in range(a, b + 1):
            t.append(k)
    else:
        for i in range(a, b + 1):
            uid = str(uuid.uuid1()).replace("-", "")
            id_value = sha256(uid.encode("utf-8")).hexdigest()  # todo each time is same uid string,so need sha diff it
            t.append(id_value)
        # print(f"{index+1}次 length of sha_list is {len(t)},range is [{a},{b}]")
    return t


def gen_id(batch_size, samples):
    rangers = [[k, k + batch_size] for k in list(range(0, samples, batch_size))]
    generator = foo(rangers)  # <class.generator>
    for index, value in enumerate(generator):
        a, b = value[0], value[-1]
        v = value_sha(a, b)
        if numeric:
            df = pd.DataFrame(np.array(v), columns=["id"], dtype=np.int64)  # todo  set dtype=np.int64
        else:
            df = pd.DataFrame(np.array(v), columns=["id"])  # todo  set dtype=np.str
        if index == 0:
            print(df.dtypes)
            df = pd.DataFrame(np.array(v), columns=["id"])
            if no_header:
                df.to_csv(path_id_csv, index=False, header=None)
            else:
                df.to_csv(path_id_csv, index=False)
        else:
            df.to_csv(path_id_csv, index=False, header=None, mode="a")
        print(
            f"finish {index + 1}x{batch_size} row time write,value index range is [{value[0]},{value[-1]}],length of sha256msg is {len(value)}")


def check_set():
    df = pd.read_csv(path_id_csv)
    array = df.values.tolist()
    mp = list(map(lambda x: x[0], array))
    print(f"set {path_id_csv} sha256 id columns去重后行数：", len(list(set(mp))))


if __name__ == '__main__':
    import time

    start = time.time()
    gen_id(batch_size, total_samples)
    print(time.time() - start)
    print(f"<<<<<<<<<<finish gen {total_samples} rows sha256 id to {path_id_csv}<<<<<<<<<")
    # check_set()

使用sha256或者id range生成id列

gendata out 根据上述产生csv的id 列进行交集大数据

import pandas as pd
import numpy as np

__author__ = 'Chenquan'
# todo before you run generate_output.py,please run shamsg_unique.py to gen id col to csv first for read.
""">>>>10wx1000columns cost 143.43s <<<<< 10wx10columns cost 2.02s"""
# 特征列
col = 10

# generate samples rows numbers,must be the same with id_sha256.csv id rows
totals_row = 100000

# 每次yield分批的写入save_data output数量样本,suggest 2000 or 5000 or 10000 ,
batch_size = 20000

# data_output path for guest or host  data_set
target_path = "./breast_b.csv"

# id_csv path
id_csv_path = "./id_sha256.csv"  # todo id col support numeric and sha256 object type

# with label,生成数据是否带有label
label_switch = True
# data_set id column dtype,$id_csv_path id type is numeric set dtype=np.int64,else dtype=np.object
numeric = True

if batch_size > totals_row:
    raise ValueError(f"batch_size number can't more than samples")


def yield_id():
    data_set = pd.read_csv(id_csv_path, chunksize=batch_size, iterator=True, header=None)
    for it in data_set:
        a = list(map(lambda x: x[0], it.values.tolist()))
        yield a


def concat(with_label):
    ids = yield_id()
    for id_list in ids:  # todo len(id_list)=batch_size
        if numeric:
            id_type = np.int64
        else:
            id_type = None
        df_id = pd.DataFrame(id_list, columns=["id"], dtype=id_type)
        value_a = np.around(np.random.normal(0, 1, (batch_size, col)), decimals=5, out=None)
        df_feature = pd.DataFrame(value_a, columns=[f"x{i}" for i in range(col)])
        if with_label:
            df_y = pd.DataFrame(np.random.choice(2, batch_size), dtype=np.int64, columns=["y"])
            one_iter_data = pd.concat([df_id, df_y, df_feature], axis=1, ignore_index=False)
        else:
            one_iter_data = pd.concat([df_id, df_feature], axis=1, ignore_index=False)
        # print(one_iter_data)
        yield one_iter_data


def save_data(path, with_label):
    """ if with_label true then generate $target_path with label y column """
    one_batch = concat(with_label)
    for index, df_dt in enumerate(one_batch):
        if index == 0:
            print(df_dt.dtypes, "\n")
            print(f"header of csv:\n{df_dt.columns.values.tolist()}")
            df_dt.to_csv(path, index=False)
        else:
            df_dt.to_csv(path, index=False, mode="a", header=None)


if __name__ == '__main__':
    import time

    start = time.time()
    idsha256 = pd.read_csv(id_csv_path, header=None)
    id_sha256_rows = idsha256.shape[0]
    if totals_row == id_sha256_rows:
        pass
    else:
        raise ValueError(
            f"Sample total rows is {totals_row} must be the same with id_sha256.csv id rows size:{id_sha256_rows}")
    save_data(target_path, with_label=label_switch)
    print(time.time() - start)