Reading XES files with the pm4py library

千白莫 · 2022-04-08

Official pm4py documentation: https://pm4py.fit.fraunhofer.de/ — PM4Py is a process mining package for Python that implements the latest, most useful, and extensively tested process mining methods, and its practical design makes the introduction to process mining very pleasant.

Reading an XES file

Direct read

from pm4py.objects.log.importer.xes import importer as xes_importer
log = xes_importer.apply('<path_to_xes_file.xes>')  # the quoted string is the file path

Reading with the events sorted by timestamp

from pm4py.objects.log.importer.xes import importer as xes_importer
variant = xes_importer.Variants.ITERPARSE
parameters = {variant.value.Parameters.TIMESTAMP_SORT: True}
log = xes_importer.apply('<path_to_xes_file>',  # the quoted string is the file path
                         variant=variant, parameters=parameters)
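
Recent pm4py releases also expose a simplified top-level reader that wraps the importer above; a minimal sketch, assuming a reasonably new pm4py version:

import pm4py

# One-call import of an XES file (part of pm4py's simplified interface
# in recent versions; equivalent to the importer-based read above).
log = pm4py.read_xes('<path_to_xes_file.xes>')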

Accessing the log

The data in an XES log is a list of dictionary-like objects; the whole file can be treated as a two-dimensional array ([number of traces][number of events in each trace]).

(The code below uses the BPIC15_1.xes dataset.)

print(log[0]) #prints the first trace of the log
'''
{'attributes': {'Includes_subCases': 'N', 'concept:name': '2760925', 'Responsible_actor': '4901428', 'endDate': datetime.datetime(2014, 6, 10, 14, 13, 27, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'caseStatus': 'G', 'parts': 'Inrit/Uitweg,Inrit/Uitweg', 'last_phase': 'Buiten behandeling gelaten', 'case_type': '557669', 'startDate': datetime.datetime(2010, 10, 5, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'requestComplete': 'TRUE', 'IDofConceptCase': '2760933'}, 'events': [{'question': '5-10-2010 0:00:00', 'dateFinished': '2010-10-07 14:57:22', 'dueDate': datetime.datetime(2010, 10, 10, 14, 43, 8, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'action_code': '01_HOOFD_010', 'activityNameEN': 'register submission date request', 'planned': datetime.datetime(2010, 10, 8, 14, 43, 8, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'time:timestamp': datetime.datetime(2010, 10, 5, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'monitoringResource': '560894', 'org:resource': '560872', 'activityNameNL': 'registratie datum binnenkomst aanvraag', 'concept:name': '01_HOOFD_010', 'lifecycle:transition': 'complete'}, '..', {'monitoringResource': '560894', 'org:resource': '560872', 'activityNameNL': 'fase aanvraag ontvangen', 'concept:name': '01_HOOFD_015', 'question': 'EMPTY', 'dateFinished': '2010-10-07 14:57:22', 'action_code': '01_HOOFD_015', 'activityNameEN': 'phase application received', 'lifecycle:transition': 'complete', 'time:timestamp': datetime.datetime(2010, 10, 7, 14, 57, 14, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)))}]}
'''
print(log[0][0]) #prints the first event of the first trace
'''
{'question': '5-10-2010 0:00:00', 'dateFinished': '2010-10-07 14:57:22', 'dueDate': datetime.datetime(2010, 10, 10, 14, 43, 8, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'action_code': '01_HOOFD_010', 'activityNameEN': 'register submission date request', 'planned': datetime.datetime(2010, 10, 8, 14, 43, 8, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'time:timestamp': datetime.datetime(2010, 10, 5, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'monitoringResource': '560894', 'org:resource': '560872', 'activityNameNL': 'registratie datum binnenkomst aanvraag', 'concept:name': '01_HOOFD_010', 'lifecycle:transition': 'complete'}
'''
print(dict(log[0].attributes).keys())  # .attributes holds the names of the attributes stored at that level (the whole log and each trace have attributes; a single event does not)
'''
dict_keys(['Includes_subCases', 'concept:name', 'Responsible_actor', 'endDate', 'caseStatus', 'parts', 'last_phase', 'case_type', 'startDate', 'requestComplete', 'IDofConceptCase'])
'''
print(dict(log[0][0]).keys())  # a single event can be converted to a dict directly to inspect its keys
'''
dict_keys(['question', 'dateFinished', 'dueDate', 'action_code', 'activityNameEN', 'planned', 'time:timestamp', 'monitoringResource', 'org:resource', 'activityNameNL', 'concept:name', 'lifecycle:transition'])
'''
print(log[0][0]["concept:name"])  # a value can be fetched directly by its key
'''
01_HOOFD_010
'''
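
Beyond indexing individual traces and events, the whole log can also be flattened into a pandas DataFrame for inspection via pm4py's log converter (the same module used for csv handling later); a minimal sketch:

from pm4py.objects.conversion.log import converter as log_converter

# Flatten the log into one row per event; TO_DATA_FRAME is a standard
# variant of pm4py's log converter.
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)
print(df.head())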

Example

The goal of this example is to compute a set of statistics for each dataset:

targets = [
    '数据集',
    'Case数量',
    'Activity数量',
    'Event数量',
    '日志开始时间',
    '日志结束时间',
    'Case内最大Event数量',
    'Case内最小Event数量',
    'Case内平均Event数量',
    'Case最长持续时间',
    'Case最短持续时间',
    'Case平均持续时间'
]

Header file layout (DataStatistic.xlsx): an empty table whose columns are the twelve target names above.
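It could be generated from the targets list itself; a minimal sketch, assuming the header file is nothing more than the column names:

import pandas as pd

# Build an empty table with the target names as columns and save it as
# the header file (targets is the list defined above).
pd.DataFrame(columns=targets).to_excel("./data/DataStatistic.xlsx", index=False)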

First, read the file using the XES-reading approach shown earlier:

def read_xes(file_address):
    variant = xes_importer.Variants.ITERPARSE
    parameters = {variant.value.Parameters.TIMESTAMP_SORT: True}
    log = xes_importer.apply(file_address, variant=variant, parameters=parameters)
    return log

Next, process the loaded event log:

def data_processing_event_log(event_log, targets_dict):  # input: an event log and a dict whose '数据集' key is already set (this function does not derive the dataset name; set it before calling)
    Case_num = len(event_log)  # number of traces (cases)
    # print(Case_num)

    # initialization
    Activity_set = set()  # a set counts the distinct activities (sets deduplicate)
    Event_num = 0  # total number of events
    startDate = None  # log start time
    endDate = None  # log end time
    max_Event_num = 0  # maximum events per case
    min_Event_num = 0x3f3f3f3f  # minimum events per case (large sentinel)
    mean_Event_num = 0  # mean events per case
    Case_Max_Duration = None  # longest case duration
    Case_Min_Duration = None  # shortest case duration
    Case_mean_Duration = None  # mean case duration
    for i in range(len(event_log)):
        trace = event_log[i]  # a single trace
        trace_len = len(trace)  # number of events in this trace
        if max_Event_num < trace_len: max_Event_num = trace_len
        if min_Event_num > trace_len: min_Event_num = trace_len
        mean_Event_num += trace_len  # accumulate the total event count of the log
        Event_num += trace_len
        start_time = None  # earliest timestamp within this trace
        end_time = None  # latest timestamp within this trace
        for j in range(len(trace)):
            event = trace[j]
            activity = None
            try:
                activity = event["concept:name"]  # the activity label
            except KeyError:
                pass
            if activity is not None:
                Activity_set.add(activity)
            try:
                date_time = event["time:timestamp"]  # the event timestamp
            except KeyError:
                continue
            # events were sorted by timestamp at import time, so the first timestamp is the trace start and the last is the trace end
            if start_time is None: start_time = date_time
            end_time = date_time
        # use the trace start and end times to update the log start time (earliest trace start) and end time (latest trace end)
        if start_time is not None:
            if startDate is None:
                startDate = start_time
            else:
                if startDate > start_time:
                    startDate = start_time
        if end_time is not None:
            if endDate is None:
                endDate = end_time
            else:
                if endDate < end_time:
                    endDate = end_time
        # compute the trace duration from its start and end times
        try:
            Duration = end_time - start_time  # subtracting two datetimes yields a timedelta
        except TypeError:
            continue
        # update the maximum, minimum, and accumulated durations
        if Case_Max_Duration is None:
            Case_Max_Duration = Duration
        else:
            if Case_Max_Duration < Duration:
                Case_Max_Duration = Duration
        if Case_Min_Duration is None:
            Case_Min_Duration = Duration
        else:
            if Case_Min_Duration > Duration:
                Case_Min_Duration = Duration
        if Case_mean_Duration is None:
            Case_mean_Duration = Duration
        else:
            Case_mean_Duration += Duration  # accumulate the total duration of the log

    def timeRounding(date_time):  # drop the fractional seconds of a timedelta
        data_time_ = datetime.timedelta(seconds=int(date_time.total_seconds()))
        return data_time_

    Activity_num = len(Activity_set)  # the set size is the number of distinct activities in the log
    # print(Activity_num)
    Case_mean_Duration /= Case_num  # mean case duration = total duration / number of cases
    mean_Event_num = round(mean_Event_num / Case_num, 2)  # keep two decimal places
    # print(Event_num)
    # print(startDate)
    # print(endDate)
    # print(max_Event_num)
    # print(min_Event_num)
    # print(mean_Event_num)

    # drop the fractional seconds of the timedeltas
    Case_Max_Duration = timeRounding(Case_Max_Duration)
    Case_Min_Duration = timeRounding(Case_Min_Duration)
    Case_mean_Duration = timeRounding(Case_mean_Duration)

    # print(Case_Max_Duration)
    # print(Case_Min_Duration)
    # print(Case_mean_Duration)

    targets_dict = targets_dict.copy()
    targets_dict['Case数量'] = [Case_num]
    targets_dict['Activity数量'] = [Activity_num]
    targets_dict['Event数量'] = [Event_num]
    targets_dict['日志开始时间'] = [startDate]
    targets_dict['日志结束时间'] = [endDate]
    targets_dict['Case内最大Event数量'] = [max_Event_num]
    targets_dict['Case内最小Event数量'] = [min_Event_num]
    targets_dict['Case内平均Event数量'] = [mean_Event_num]
    targets_dict['Case最长持续时间'] = [Case_Max_Duration]
    targets_dict['Case最短持续时间'] = [Case_Min_Duration]
    targets_dict['Case平均持续时间'] = [Case_mean_Duration]
    return targets_dict  # return the fully populated dict
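
Used on a single log, the two helpers above might be combined like this (a sketch; the file name is only an example):

# Hypothetical single-log usage of read_xes and data_processing_event_log.
log = read_xes("./data/origin_data/BPIC15_1.xes")
stats = data_processing_event_log(log, {'数据集': ['BPIC15_1']})
print(pd.DataFrame(stats))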

Finally, batch-process multiple datasets:

def data_processing(origin_data_folder_address, originDataStatistic):  # input: the folder containing the datasets and the DataFrame to fill
    file_list = os.listdir(origin_data_folder_address)  # all file names in the folder
    DataStatistic = [originDataStatistic.copy()]  # wrap in a list so pd.concat can be used
    # the whole process amounts to continually appending new rows to the file
    for file_name in file_list:
        print(file_name)
        if ".xes" in file_name:
            targets_dict = dict()
            for target in targets:
                targets_dict[target] = None
            file_name_re = file_name.replace(".xes", "")  # strip the file extension
            if file_name_re in list(DataStatistic[0]["数据集"]): continue  # skip datasets that were already processed
            targets_dict['数据集'] = [file_name_re]  # set the dataset name; data_processing_event_log fills the rest

            event_log = read_xes(origin_data_folder_address + file_name)  # read the file
            targets_dict = data_processing_event_log(event_log, targets_dict)  # process the file
            targets_Df = pd.DataFrame(targets_dict)  # convert the dict to a DataFrame
            DataStatistic.append(targets_Df)
        # write the file after each dataset so progress survives an unexpected termination
        DataStatistic = pd.concat(DataStatistic, axis=0)  # merge the new rows
        DataStatistic = DataStatistic.reset_index(drop=True)  # rebuild the index
        DataStatistic.to_csv("./DataStatistic.csv")
        DataStatistic = [DataStatistic]  # restore the list form
    res_Df = pd.concat(DataStatistic, axis=0)
    print(res_Df)
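
The list-to-DataFrame-and-back dance inside the loop exists only so that partial results reach disk after every file; the checkpointing pattern in isolation looks like this (a sketch with toy data):

import pandas as pd

frames = [pd.DataFrame({"数据集": ["already_done"]})]  # the existing table
for name in ["new_a", "new_b"]:
    frames.append(pd.DataFrame({"数据集": [name]}))  # stage the new row
    merged = pd.concat(frames, axis=0).reset_index(drop=True)
    merged.to_csv("./DataStatistic.csv")  # checkpoint to disk
    frames = [merged]  # keep appending in the next iteration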

The complete code for processing the XES files:

import os
import pandas as pd
import datetime

from pm4py.objects.log.importer.xes import importer as xes_importer

# widen the pandas display so output is not truncated with ellipses
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

targets = [
    '数据集',
    'Case数量',
    'Activity数量',
    'Event数量',
    '日志开始时间',
    '日志结束时间',
    'Case内最大Event数量',
    'Case内最小Event数量',
    'Case内平均Event数量',
    'Case最长持续时间',
    'Case最短持续时间',
    'Case平均持续时间'
]


def read_xes(file_address):
    variant = xes_importer.Variants.ITERPARSE
    parameters = {variant.value.Parameters.TIMESTAMP_SORT: True}
    log = xes_importer.apply(file_address, variant=variant, parameters=parameters)
    return log


def data_processing_event_log(event_log, targets_dict):  # input: an event log and a dict whose '数据集' key is already set (this function does not derive the dataset name; set it before calling)
    Case_num = len(event_log)  # number of traces (cases)
    # print(Case_num)

    # initialization
    Activity_set = set()  # a set counts the distinct activities (sets deduplicate)
    Event_num = 0  # total number of events
    startDate = None  # log start time
    endDate = None  # log end time
    max_Event_num = 0  # maximum events per case
    min_Event_num = 0x3f3f3f3f  # minimum events per case (large sentinel)
    mean_Event_num = 0  # mean events per case
    Case_Max_Duration = None  # longest case duration
    Case_Min_Duration = None  # shortest case duration
    Case_mean_Duration = None  # mean case duration
    for i in range(len(event_log)):
        trace = event_log[i]  # a single trace
        trace_len = len(trace)  # number of events in this trace
        if max_Event_num < trace_len: max_Event_num = trace_len
        if min_Event_num > trace_len: min_Event_num = trace_len
        mean_Event_num += trace_len  # accumulate the total event count of the log
        Event_num += trace_len
        start_time = None  # earliest timestamp within this trace
        end_time = None  # latest timestamp within this trace
        for j in range(len(trace)):
            event = trace[j]
            activity = None
            try:
                activity = event["concept:name"]  # the activity label
            except KeyError:
                pass
            if activity is not None:
                Activity_set.add(activity)
            try:
                date_time = event["time:timestamp"]  # the event timestamp
            except KeyError:
                continue
            # events were sorted by timestamp at import time, so the first timestamp is the trace start and the last is the trace end
            if start_time is None: start_time = date_time
            end_time = date_time
        # use the trace start and end times to update the log start time (earliest trace start) and end time (latest trace end)
        if start_time is not None:
            if startDate is None:
                startDate = start_time
            else:
                if startDate > start_time:
                    startDate = start_time
        if end_time is not None:
            if endDate is None:
                endDate = end_time
            else:
                if endDate < end_time:
                    endDate = end_time
        # compute the trace duration from its start and end times
        try:
            Duration = end_time - start_time  # subtracting two datetimes yields a timedelta
        except TypeError:
            continue
        # update the maximum, minimum, and accumulated durations
        if Case_Max_Duration is None:
            Case_Max_Duration = Duration
        else:
            if Case_Max_Duration < Duration:
                Case_Max_Duration = Duration
        if Case_Min_Duration is None:
            Case_Min_Duration = Duration
        else:
            if Case_Min_Duration > Duration:
                Case_Min_Duration = Duration
        if Case_mean_Duration is None:
            Case_mean_Duration = Duration
        else:
            Case_mean_Duration += Duration  # accumulate the total duration of the log

    def timeRounding(date_time):  # drop the fractional seconds of a timedelta
        data_time_ = datetime.timedelta(seconds=int(date_time.total_seconds()))
        return data_time_

    Activity_num = len(Activity_set)  # the set size is the number of distinct activities in the log
    # print(Activity_num)
    Case_mean_Duration /= Case_num  # mean case duration = total duration / number of cases
    mean_Event_num = round(mean_Event_num / Case_num, 2)  # keep two decimal places
    # print(Event_num)
    # print(startDate)
    # print(endDate)
    # print(max_Event_num)
    # print(min_Event_num)
    # print(mean_Event_num)

    # drop the fractional seconds of the timedeltas
    Case_Max_Duration = timeRounding(Case_Max_Duration)
    Case_Min_Duration = timeRounding(Case_Min_Duration)
    Case_mean_Duration = timeRounding(Case_mean_Duration)

    # print(Case_Max_Duration)
    # print(Case_Min_Duration)
    # print(Case_mean_Duration)

    targets_dict = targets_dict.copy()
    targets_dict['Case数量'] = [Case_num]
    targets_dict['Activity数量'] = [Activity_num]
    targets_dict['Event数量'] = [Event_num]
    targets_dict['日志开始时间'] = [startDate]
    targets_dict['日志结束时间'] = [endDate]
    targets_dict['Case内最大Event数量'] = [max_Event_num]
    targets_dict['Case内最小Event数量'] = [min_Event_num]
    targets_dict['Case内平均Event数量'] = [mean_Event_num]
    targets_dict['Case最长持续时间'] = [Case_Max_Duration]
    targets_dict['Case最短持续时间'] = [Case_Min_Duration]
    targets_dict['Case平均持续时间'] = [Case_mean_Duration]
    return targets_dict


def data_processing(origin_data_folder_address, originDataStatistic):  # input: the folder containing the datasets and the DataFrame to fill
    file_list = os.listdir(origin_data_folder_address)  # all file names in the folder
    DataStatistic = [originDataStatistic.copy()]  # wrap in a list so pd.concat can be used
    # the whole process amounts to continually appending new rows to the file
    for file_name in file_list:
        print(file_name)
        if ".xes" in file_name:
            targets_dict = dict()
            for target in targets:
                targets_dict[target] = None
            file_name_re = file_name.replace(".xes", "")  # strip the file extension
            if file_name_re in list(DataStatistic[0]["数据集"]): continue  # skip datasets that were already processed
            targets_dict['数据集'] = [file_name_re]  # set the dataset name; data_processing_event_log fills the rest

            event_log = read_xes(origin_data_folder_address + file_name)  # read the file
            targets_dict = data_processing_event_log(event_log, targets_dict)  # process the file
            targets_Df = pd.DataFrame(targets_dict)  # convert the dict to a DataFrame
            DataStatistic.append(targets_Df)
        # write the file after each dataset so progress survives an unexpected termination
        DataStatistic = pd.concat(DataStatistic, axis=0)  # merge the new rows
        DataStatistic = DataStatistic.reset_index(drop=True)  # rebuild the index
        DataStatistic.to_csv("./DataStatistic.csv")
        DataStatistic = [DataStatistic]  # restore the list form
    res_Df = pd.concat(DataStatistic, axis=0)
    print(res_Df)


if __name__ == "__main__":
    origin_data_folder_address = "./data/origin_data/"  # folder containing the datasets
    # DataStatistic = pd.read_excel("./data/DataStatistic.xlsx")  # header file
    # the first run fills the header file; later runs read the existing statistics file so new datasets can be added without re-counting
    DataStatistic = pd.read_csv("./DataStatistic.csv", sep=",").drop("Unnamed: 0", axis=1)
    data_processing(origin_data_folder_address, DataStatistic)

    DataStatistic = pd.read_csv("./DataStatistic.csv", sep=",").drop("Unnamed: 0", axis=1)
    DataStatistic.to_excel("./DataStatistic.xlsx")

Later I found that the csv files' formats are highly inconsistent, so I stopped aiming for a fully generic program; instead, each csv can be converted into the same form as an XES log and processed with slightly modified versions of the code above.

A few key points when processing the csv files:

1. There are many NaN rows and columns, which have to be removed.

log_csv = pd.read_csv(file_directory_address + file_name, sep=",")
columns = log_csv.columns
for column in columns:
    if "Unnamed" in column:
        log_csv = log_csv.drop(column, axis=1)  # drop the auto-generated "Unnamed" columns
li = list(log_csv[columns[0]])
index = len(li)  # initialize to the file length, i.e. if there is no NaN then no rows are dropped
# find the row with the first NaN (typically, once NaN appears, all following rows are NaN too)
for i in range(len(li)):
    if pd.isna(li[i]):  # pd.isna is robust across dtypes, unlike "is np.nan"
        index = i
        break
log_csv = log_csv.drop(index=range(index, len(li)))  # drop row numbers [index, len(li)), i.e. from index to the end of the file
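
An equivalent, more idiomatic cleanup is possible with pandas built-ins; a sketch (note that dropna removes every all-NaN row, whereas the code above truncates everything after the first NaN in the first column):

import pandas as pd

def clean_csv(log_csv: pd.DataFrame) -> pd.DataFrame:
    # Drop the auto-generated "Unnamed" columns, then rows that are entirely NaN.
    log_csv = log_csv.loc[:, ~log_csv.columns.str.contains("Unnamed")]
    return log_csv.dropna(how="all")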

2. pm4py can convert a csv file into the same form as an XES log. One column must be designated as the case-id column (chosen by you) so that pm4py can split the rows into traces; after that, the data can be processed just like an XES log.

parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'Case ID'}  # 'Case ID' is the case-id column of the csv file
event_log = log_converter.apply(log_csv, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
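
Newer pm4py versions also offer a simplified path for the same conversion; a sketch, assuming a recent release (the column names are those of this particular csv):

import pm4py

# format_dataframe renames the chosen columns to the standard XES keys,
# after which convert_to_event_log splits the rows into traces.
log_csv = pm4py.format_dataframe(log_csv, case_id='Case ID',
                                 activity_key='Activity',
                                 timestamp_key='Complete Timestamp')
event_log = pm4py.convert_to_event_log(log_csv)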

3. Convert the timestamp strings to datetime and attach a timezone.

date_time = datetime.datetime.strptime(date_time, "%Y/%m/%d %H:%M:%S.000")
# Note: astimezone() treats a naive datetime as local time and converts it;
# use replace(tzinfo=...) to attach a timezone without shifting the value.
date_time = date_time.astimezone(datetime.timezone(datetime.timedelta(seconds=7200)))

4. The parts of the XES code that change.

The key used to fetch the activity:

activity = event["Activity"]

The key used to fetch the timestamp:

date_time = event["Complete Timestamp"]

Because the converted data is not sorted by timestamp, the trace start and end times must be found by comparing every event's timestamp one by one:

if start_time is None:
    start_time = date_time
else:
    if start_time > date_time:
        start_time = date_time
if end_time is None:
    end_time = date_time
else:
    if end_time < date_time:
        end_time = date_time
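
The same update can be written more compactly with Python's built-in min and max; a sketch:

from datetime import datetime
from typing import Optional, Tuple

def update_bounds(start: Optional[datetime], end: Optional[datetime],
                  ts: datetime) -> Tuple[datetime, datetime]:
    # Fold one timestamp into the running (earliest, latest) pair.
    start = ts if start is None else min(start, ts)
    end = ts if end is None else max(end, ts)
    return start, end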

The complete code for processing the csv file:

import pandas as pd
import numpy as np
import datetime
from pm4py.objects.conversion.log import converter as log_converter

# widen the pandas display so output is not truncated with ellipses
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

file_directory_address = "./data/origin_data/"


def processing1():
    file_name = "Dataset belonging to the help desk log of an Italian Company.csv"
    log_csv = pd.read_csv(file_directory_address + file_name, sep=",")
    columns = log_csv.columns
    for column in columns:
        if "Unnamed" in column:
            log_csv = log_csv.drop(column, axis=1)
    li = list(log_csv[columns[0]])
    index = len(li)
    for i in range(len(li)):
        if pd.isna(li[i]):  # pd.isna is robust across dtypes, unlike "is np.nan"
            index = i
            break
    log_csv = log_csv.drop(index=range(index, len(li)))

    parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'Case ID'}
    event_log = log_converter.apply(log_csv, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)

    Case_num = len(event_log)
    print(Case_num)
    Activity_set = set()
    Event_num = 0
    max_Event_num = 0
    min_Event_num = 0x3f3f3f3f
    mean_Event_num = 0
    Case_Max_Duration = None
    Case_Min_Duration = None
    Case_mean_Duration = None
    startDate = None
    endDate = None
    for i in range(len(event_log)):
        trace = event_log[i]
        trace_len = len(trace)
        if max_Event_num < trace_len: max_Event_num = trace_len
        if min_Event_num > trace_len: min_Event_num = trace_len
        mean_Event_num += trace_len
        Event_num += trace_len
        start_time = None
        end_time = None
        for j in range(len(trace)):
            event = trace[j]
            activity = None
            try:
                activity = event["Activity"]
            except KeyError:
                pass
            if activity is not None:
                Activity_set.add(activity)
            try:
                date_time = event["Complete Timestamp"]
            except KeyError:
                continue
            date_time = datetime.datetime.strptime(date_time, "%Y/%m/%d %H:%M:%S.000")
            date_time = date_time.astimezone(datetime.timezone(datetime.timedelta(seconds=7200)))
            if start_time is None:
                start_time = date_time
            else:
                if start_time > date_time:
                    start_time = date_time
            if end_time is None:
                end_time = date_time
            else:
                if end_time < date_time:
                    end_time = date_time
        if start_time is not None:
            if startDate is None:
                startDate = start_time
            else:
                if startDate > start_time:
                    startDate = start_time
        if end_time is not None:
            if endDate is None:
                endDate = end_time
            else:
                if endDate < end_time:
                    endDate = end_time
        try:
            Duration = end_time - start_time
        except TypeError:
            continue
        if Case_Max_Duration is None:
            Case_Max_Duration = Duration
        else:
            if Case_Max_Duration < Duration:
                Case_Max_Duration = Duration
        if Case_Min_Duration is None:
            Case_Min_Duration = Duration
        else:
            if Case_Min_Duration > Duration:
                Case_Min_Duration = Duration
        if Case_mean_Duration is None:
            Case_mean_Duration = Duration
        else:
            Case_mean_Duration += Duration

    def timeRounding(date_time):
        data_time_ = datetime.timedelta(seconds=int(date_time.total_seconds()))
        return data_time_

    Activity_num = len(Activity_set)
    print(Activity_num)
    Case_mean_Duration /= Case_num
    # print("Case_mean_Duration.total_seconds:",Case_mean_Duration.total_seconds())
    # Case_mean_Duration = datetime.timedelta(seconds=int(Case_mean_Duration.total_seconds()))
    mean_Event_num = round(mean_Event_num / Case_num, 2)
    print(Event_num)
    # startDate = event_log[0].attributes["startDate"]
    print(startDate)
    print(endDate)
    # endDate = event_log[-1].attributes["endDate"]
    print(max_Event_num)
    print(min_Event_num)
    print(mean_Event_num)
    Case_Max_Duration = timeRounding(Case_Max_Duration)
    Case_Min_Duration = timeRounding(Case_Min_Duration)
    Case_mean_Duration = timeRounding(Case_mean_Duration)
    print(Case_Max_Duration)
    print(Case_Min_Duration)
    print(Case_mean_Duration)

    targets_dict = dict()
    file_name_re = file_name.replace(".csv", "")
    targets_dict['数据集'] = [file_name_re]
    targets_dict['Case数量'] = [Case_num]
    targets_dict['Activity数量'] = [Activity_num]
    targets_dict['Event数量'] = [Event_num]
    targets_dict['日志开始时间'] = [startDate]
    targets_dict['日志结束时间'] = [endDate]
    targets_dict['Case内最大Event数量'] = [max_Event_num]
    targets_dict['Case内最小Event数量'] = [min_Event_num]
    targets_dict['Case内平均Event数量'] = [mean_Event_num]
    targets_dict['Case最长持续时间'] = [Case_Max_Duration]
    targets_dict['Case最短持续时间'] = [Case_Min_Duration]
    targets_dict['Case平均持续时间'] = [Case_mean_Duration]
    targets_Df = pd.DataFrame(targets_dict)
    DataStatistic = pd.read_csv("./DataStatistic.csv", sep=",").drop("Unnamed: 0", axis=1)
    if file_name_re in list(DataStatistic["数据集"]): return
    DataStatistic = pd.concat([DataStatistic, targets_Df], axis=0).reset_index(drop=True)
    DataStatistic.to_csv("./DataStatistic.csv")
    DataStatistic = pd.read_csv("./DataStatistic.csv", sep=",").drop("Unnamed: 0", axis=1)
    DataStatistic.to_excel("./DataStatistic.xlsx")


if __name__ == "__main__":
    processing1()

Appendix: download pages for all datasets used in the statistics

Event Data - Process Mining

Environmental permit application process (‘WABO’), CoSeLoG project – Municipality 5

NASA Crew Exploration Vehicle (CEV) Software Event Log

Dataset belonging to the help desk log of an Italian Company

Hospital Billing - Event Log

Road Traffic Fine Management Process
