文章目录
基本概念
-
iteration
-
iterable
-
iterator
-
generator
迭代器
迭代器的内置方法
# Demo: iter() turns an iterable (str, list) into an iterator, and the
# iterator exposes both __iter__() and __next__().
s = "alan"
si = iter(s)  # iterator over the characters of s
l = [1, 2]
li = iter(l)  # iterator over the items of l
print(si) # <str_iterator object at 0x1061cdb70>
print(li) # <list_iterator object at 0x10a3142e0>
print(si.__iter__()) # prints the same str_iterator object as print(si): iterables (and iterators) have an __iter__() method
print(si.__next__()) # a -- an iterator also has a __next__() method
print(si.__next__()) # l
print(si.__iter__() is si) # True -- an iterator's __iter__() returns the iterator itself
构造迭代器
# Custom iterator
class DataIter:
    """Iterator over the positional arguments given at construction time.

    The object is its own iterator (``__iter__`` returns ``self``), so it can
    be traversed only once: after the last item has been returned, every
    further ``__next__`` call raises ``StopIteration``.
    """

    def __init__(self, *args):
        self.data = list(args)  # the items to hand out, in order
        self.ind = 0  # index of the next item to return

    def __iter__(self):
        """Return the iterator itself, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Return the next item, or raise ``StopIteration`` when exhausted."""
        # ``>=`` instead of ``==``: if ``ind`` is ever moved past the end,
        # iteration still terminates cleanly instead of raising IndexError.
        if self.ind >= len(self.data):
            raise StopIteration
        item = self.data[self.ind]
        self.ind += 1
        return item
# Drive the custom iterator by hand through the iterator protocol.
obj = DataIter(1, 2, 3)
print(obj.__iter__()) # <__main__.DataIter object at 0x10f3cfe20> -- DataIter.__iter__ returns obj itself
print(obj.__next__()) # 1
print(obj.__next__()) # 2
print(obj.__next__()) # 3
# print(obj.__next__()) # would raise StopIteration: all three items have been consumed
进阶版迭代器
"""
next函数只能向前取数据,一次取一个,取过的数据不能重复取,这个限制能不能解决呢?
iterator只能迭代一次,但iterable对象没有这个限制:每次对它调用iter()都会得到一个新的iterator
因此可以把iterator从数据中分离出来,分别定义一个iterable和一个iterator,如下
"""
class Data:  # an iterable only, not an iterator
    """Re-iterable container for the positional arguments it is built from.

    ``Data`` defines ``__iter__`` but no ``__next__``: it does not iterate
    itself. Every ``iter()`` call returns a new ``DataIterator`` with its own
    position, so the same ``Data`` object can be looped over repeatedly.
    """

    def __init__(self, *args):
        self.data = list(args)  # the stored items, read by DataIterator

    def __iter__(self):
        """Return a fresh iterator over the stored items (never ``self``)."""
        return DataIterator(self)
class DataIterator:  # the iterator
    """Single-pass iterator over the ``data`` list of the object it is given.

    Args:
        data: any object exposing a ``data`` sequence attribute. The sequence
            is referenced, not copied.
    """

    def __init__(self, data):
        self.data = data.data  # shared reference to the source's items
        self.ind = 0  # index of the next item to return

    def __iter__(self):
        """Return the iterator itself, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Return the next item, or raise ``StopIteration`` when exhausted."""
        # ``>=`` instead of ``==``: if the shared list shrinks below ``ind``,
        # iteration still terminates cleanly instead of raising IndexError.
        if self.ind >= len(self.data):
            raise StopIteration
        item = self.data[self.ind]
        self.ind += 1
        return item
if __name__ == '__main__':
    d = Data(1, 2, 3)
    # Each for loop calls iter(d), and Data.__iter__ returns a new
    # DataIterator, so the second loop starts from the beginning again.
    for x in d:
        print(x)
    for x in d:
        print(x)
# Expected output:
# 1
# 2
# 3
# 1
# 2
# 3
生成器
生成器的内置方法
# Generator built from a generator expression (parentheses instead of brackets)
a = (x for x in range(10000000))
print(next(a)) # 0
print(next(a)) # 1
print(a.__next__()) # 2 -- next(a) and a.__next__() advance the same generator
print(a.__iter__() is a) # True -- a generator's __iter__() returns the generator itself
print(iter(a) is a) # True
# Generators can also be built with the yield keyword
构造生成器①-生成器表达式(俗称“元组生成式”)
# Same expression, different brackets. A bare `type(a)` only displays its
# value in an interactive session; in a script it prints nothing.
a = [x for x in range(3)]  # square brackets: list comprehension
type(a) # list
a = (x for x in range(3))  # parentheses: generator expression, not a tuple
type(a) # generator
比较列表生成式和生成器的耗时
import time

# Time summing a fully built list against summing a lazy generator expression.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, whereas the wall
# clock can be adjusted while the measurement is running.
tic = time.perf_counter()
a = sum([x for x in range(10000000)])  # builds the whole list first
toc = time.perf_counter()
print(toc - tic) # e.g. 2.3462159633636475 (machine-dependent)

tic = time.perf_counter()
a = sum(x for x in range(10000000))  # items are produced one at a time
toc = time.perf_counter()
print(toc - tic) # e.g. 0.847653865814209 (machine-dependent)
构造生成器②-“yield关键字”
def num():
    """Yield the integers 0 through 4, printing a trace line around each step.

    Being a generator function, none of the body runs when ``num()`` is
    called. The first ``next()`` prints the start message and yields 0; each
    later resumption prints the continue message before the next value (or
    before the generator finishes).
    """
    print('开始执行')
    for i in range(5):
        yield i
        # Runs only when the consumer asks for the following value.
        print('继续执行')
mygen = num()  # only creates the generator; the body of num() has not started yet
print(type(mygen)) # <class 'generator'>
# Fetch one value with next(); the returned value is discarded here
next(mygen)
# Fetch the remaining values with a for loop
for step in mygen:
    print(step)
实操
def get_table(**kwargs):
    """Load a ``Table`` object for the named table from the datacenter engine.

    The table is first looked up under the name exactly as given. If that
    fails, the lookup is retried with the lower-cased name.
    NOTE(review): ``MetaData``/``Table`` look like SQLAlchemy's, with
    ``autoload=True`` reflecting the columns from the database -- confirm
    against the file's imports.

    Keyword Args:
        schema: forwarded as the ``schema`` argument of ``Table``; ``None``
            when omitted.
        table: name of the table to load.

    Returns:
        The ``Table`` built from the first lookup that succeeds.

    Raises:
        Whatever the second ``Table(...)`` call raises when both lookups fail.
    """
    engine = db_client.datacenter_orm.get_engine()
    schema = kwargs.get('schema')
    table_name = kwargs.get('table')
    metadata = MetaData(bind=engine)
    try:
        table_item = Table(table_name, metadata, autoload=True, schema=schema)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit are not swallowed by the retry.
        table_item = Table(table_name.lower(), metadata, autoload=True, schema=schema)
    return table_item
def get_batch(table_model, task_info):
    """Yield the selected columns of ``table_model`` batch by batch.

    Each batch is fetched in its own session with OFFSET/LIMIT paging, ordered
    by the sort field, and converted with ``row2dict``.

    Args:
        table_model: table/model object passed to ``get_model_field`` to
            resolve field names.
        task_info: mapping with the optional keys
            ``"sort_field"`` (name of the ordering field, default ``""``),
            ``"common_field"`` (names of the fields to select, default ``[]``)
            and ``"batch_size"`` (rows per batch, default ``1000``).

    Yields:
        One list of ``row2dict`` results per batch. The last data batch is the
        first one shorter than ``batch_size``; after it a final empty list is
        always yielded.
    """
    sort_field = task_info.get("sort_field", "")
    common_field = task_info.get("common_field", [])
    batch_size = task_info.get("batch_size", 1000)
    fields = tuple(get_model_field(table_model, f) for f in common_field)
    limit_offset = 0
    while True:
        session = db_client.datacenter_orm.new_session()
        try:
            items = (
                session.query(*fields)
                .order_by(get_model_field(table_model, sort_field))
                .offset(limit_offset)
                .limit(batch_size)
                .all()
            )
            item_dict = [row2dict(item) for item in items]
        finally:
            # Close the session even when the query or the conversion raises,
            # so a failing batch does not leak it.
            session.close()
        # The session is already closed while the consumer handles the batch.
        yield item_dict
        if len(item_dict) < batch_size:
            break
        limit_offset += len(item_dict)
    # Trailing empty batch kept from the original for consumers that expect it.
    yield []
task_info = {
    "common_field": ["id", "name"],
    "sort_field": "id"
}
# get_table() accepts keyword arguments only, so the config has to be unpacked;
# passing it positionally raises TypeError.
# NOTE(review): assumes self.conf is a mapping with 'schema' and 'table' keys,
# and that this snippet lives inside a method where `self` exists -- confirm.
y_data = get_batch(get_table(**self.conf), task_info)
for data_column in y_data:
    ...