import json
import re
from collections import defaultdict, UserDict, UserList
from dataclasses import dataclass, field
from pathlib import Path
from typing import Union, Any
import pandas as pd
import xlwings as xw
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox
# Anything accepted as a file location: a ``Path`` object or a plain string.
FileOrName = Union[Path, str]

# The current user's ``Desktop`` folder inside their home directory.
desktop = Path.home().joinpath('Desktop')
def desktop_fd():
    """Return the ``Result`` folder on the desktop, creating it if needed.

    Returns:
        Path: ``desktop / 'Result'``; the directory exists when the call
        returns.
    """
    dst_dir_fd = desktop / 'Result'
    # parents=True: without it mkdir raises FileNotFoundError whenever the
    # Desktop folder itself does not exist yet.
    dst_dir_fd.mkdir(parents=True, exist_ok=True)
    return dst_dir_fd
def auto_excel(fun):
    """Decorate a JSON-writing function so an Excel copy is produced as well.

    The decorated function must return the ``Path`` of the JSON file it
    wrote.  After it runs, the JSON is loaded with pandas, transposed, and
    written to a workbook saved next to the JSON file with an ``.xlsx``
    suffix.

    Args:
        fun: Callable returning the path (``Path``) of a JSON file.

    Returns:
        A wrapper with the same signature and metadata as ``fun``; it returns
        whatever ``fun`` returned (the JSON path), not the workbook path.
    """
    # Local import keeps this block self-contained; ``wraps`` preserves the
    # decorated function's __name__ / __doc__ / __wrapped__.
    import functools

    # Columns switched to Excel's text format ("@") before the data is
    # written -- presumably so long digit strings are not reinterpreted as
    # numbers (TODO confirm which fields land in these columns).
    text_columns = ("C:C", "H:H", "I:I")

    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        fullname = fun(*args, **kwargs)
        output = fullname.with_suffix('.xlsx')
        df = pd.read_json(fullname).convert_dtypes().T.fillna('')
        with xw.App(visible=False, add_book=True) as app:
            app.display_alerts = False
            app.screen_updating = False
            wb = app.books.active
            sht = wb.sheets.active
            for column in text_columns:
                sht.range(column).api.NumberFormat = "@"
            sht.range('A1').value = df
            sht.autofit()

            # Style the header row: grey fill, white bold text.
            rng = sht.range('A1').expand('right')
            rng.color = (128, 128, 128)
            rng.row_height = 15
            rng.font.color = (255, 255, 255)
            rng.font.bold = True
            # Raw Excel enum values: -4108 is xlCenter, -4130 is xlJustify.
            rng.api.HorizontalAlignment = -4108
            rng.api.VerticalAlignment = -4130
            wb.save(output)
            wb.close()
        return fullname

    return wrapper
@dataclass
class FileDictLists(UserDict):
    """Mapping of directory -> list of PDF files found beneath a root folder.

    On construction the root folder is scanned recursively for ``*.pdf``
    files; each file is stored under the key of its parent directory.

    Attributes set in ``__post_init__``:
        path: The resolved root folder that was scanned.
    """

    # Root folder to scan.  ``None`` means "the folder containing this
    # module".  A path that is not an existing directory (e.g. a file path)
    # falls back to its parent folder.
    dir_path: Union[str, Path, None] = None
    # Folder that receives ``files.json``.
    _dst: Path = field(default_factory=desktop_fd)

    def __post_init__(self):
        super().__init__()
        if self.dir_path is None:
            self.path = Path(__file__).parent.resolve()
        else:
            root = Path(self.dir_path)
            # Scan the directory that was actually requested; only step up
            # one level when the argument is not a directory.
            self.path = (root if root.is_dir() else root.parent).resolve()
        target_data = defaultdict(list)
        for file in self.path.rglob('*.pdf'):
            target_data[file.parent].append(file)
        self.data.update(target_data)

    @property
    def dirs(self):
        """Generator over every directory that contains at least one PDF."""
        return (directory for directory in self.data)

    def _iter_files(self):
        """Yield each PDF ``Path`` located directly inside the known dirs."""
        for directory in self.dirs:
            yield from directory.glob('*.pdf')

    @property
    def filenames(self):
        """Generator over the PDF paths as POSIX-style strings."""
        return (pdf.as_posix() for pdf in self._iter_files())

    @property
    def relative_path_filenames(self):
        """Relative path file names.

        Returns:
            list[str]: One ``./sub/dir/name.pdf`` entry per PDF, relative to
            ``self.path``.  Empty when no PDF was found.
        """
        # relative_to strips only the leading root.  A plain string replace
        # would also rewrite later occurrences of the root text inside the
        # path.
        return [
            './' + pdf.relative_to(self.path).as_posix()
            for pdf in self._iter_files()
        ]

    def dump_json(self):
        """Write ``files.json`` into ``_dst``: relative file name -> null."""
        data = dict.fromkeys(self.relative_path_filenames)
        _json = self._dst / 'files.json'
        with open(_json, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False, separators=(',', ':'))
@dataclass
class RegEx:
    """Regular expression configuration.

    InvoiceNumber: 8 digits long; the trailing ``$`` is required, otherwise
        the first 8 digits of an invoice code would match as well.
    InvoiceCode: 12 digits.  Last digit 0 and no machine number ->
        blockchain e-invoice; first digit 0 and last two digits 11 ->
        ordinary e-invoice.
    TaxPayerNum: starts with 91 or 92, 18 characters long, either digits
        only or a mix of digits and uppercase letters.
    """

    # The attributes below carry no annotations, so they are plain class
    # attributes shared by all instances, not dataclass fields.

    # Eight digits reaching the end of the text.
    InvoiceNumber = re.compile(r'\d{8}$')
    # Twelve digits: leading 0/1, ending in "11" or in "<digit>0".
    InvoiceCode = re.compile(r'^[01]\d{9}(11|\d0)$')
    # Date written with the year/month/day characters, e.g. 2023年1月5日.
    BillingDate_0 = re.compile(r'(20)\d{2}年\d{1,2}月\d{1,2}日')
    # All-digit text beginning with "20" followed by at least two digits.
    BillingDate_1 = re.compile(r'^(20)\d{2}\d*$')
    # Known tax-rate labels; the "$" applies to the "***" alternative only.
    TaxRate = re.compile(r'13%|9%|6%|5%|3%|1%|0%|免税|不征税|\*{3}$')
    # Amount with exactly two decimals.
    TotalTaxSum = re.compile(r'(\d+)\.\d{2}')
    # ASCII-only matching so ``\d`` does not accept non-ASCII digits.
    TaxPayerNum = re.compile(r'^(91|92)[A-Z\d]{16}', re.ASCII)
class ContentList(UserList):
    """List of raw text fragments with field extractors for an invoice.

    Iterating the list yields every fragment with all whitespace removed;
    the ``_get_*`` helpers scan those cleaned fragments with the patterns
    held by ``regex``.
    """

    regex = RegEx()

    def __iter__(self):
        """Yield each stored fragment with every whitespace run removed."""
        for raw in self.data:
            yield re.sub(r'\s+', '', raw)

    def _first_match(self, *patterns):
        """Return the matched text of the first fragment any pattern matches.

        Fragments are visited in order and, for each fragment, the patterns
        are tried in the order given (anchored at the fragment start via
        ``match``).  Returns ``None`` when nothing matches.
        """
        for text in self:
            for pattern in patterns:
                hit = pattern.match(text)
                if hit:
                    return hit.group()
        return None

    def _get_invoice_code(self):
        """First fragment matching the invoice-code pattern, else ``None``."""
        return self._first_match(self.regex.InvoiceCode)

    def _get_invoice_number(self):
        """First fragment matching the invoice-number pattern, else ``None``."""
        return self._first_match(self.regex.InvoiceNumber)

    def _get_bill_date(self):
        """First billing date found (either date pattern), else ``None``."""
        return self._first_match(self.regex.BillingDate_0, self.regex.BillingDate_1)

    def _get_tax_rate(self):
        """First tax-rate label found, else ``None``."""
        return self._first_match(self.regex.TaxRate)

    def _get_tts(self):
        """Total Tax Sum: the last two-decimal amount found, else ``None``."""
        amounts = [
            hit.group()
            for hit in map(self.regex.TotalTaxSum.search, self)
            if hit
        ]
        return amounts[-1] if amounts else None

    def _get_tpn(self):
        """Tax Payer Num: collect at most the first two taxpayer numbers.

        Returns:
            dict: keys ``taxpayerNumber`` and ``salesTaxpayerNum``.  With two
            hits they hold the first and second hit respectively; with one
            hit only ``salesTaxpayerNum`` is filled; otherwise both are
            ``None``.
        """
        found = []
        for text in self:
            hit = self.regex.TaxPayerNum.match(text)
            if not hit:
                continue
            found.append(hit.group())
            if len(found) == 2:
                break

        result = {'taxpayerNumber': None, 'salesTaxpayerNum': None}
        if len(found) == 2:
            result['taxpayerNumber'] = found[0]
        if found:
            result['salesTaxpayerNum'] = found[-1]
        return result

    def dump_dict(self):
        """Assemble all extracted fields into one dictionary.

        Returns:
            dict: ``{'anchor': False}`` when the list is empty, otherwise
            ``anchor=True`` plus one entry per extracted field (``None`` for
            fields that were not found).
        """
        if not self.data:
            return {'anchor': False}

        payer_numbers = self._get_tpn()
        return {
            'anchor': True,
            'InvoiceCode': self._get_invoice_code(),
            'InvoiceNumber': self._get_invoice_number(),
            'BillTime': self._get_bill_date(),
            'TaxRate': self._get_tax_rate(),
            'TotalTaxSum': self._get_tts(),
            'TaxPayerNum': payer_numbers.get('taxpayerNumber'),
            'SalesTaxPayerNum': payer_numbers.get('salesTaxpayerNum'),
        }
@dataclass
class EInvoice:
    """E-invoice: text extracted from the first page of one PDF file."""

    # Location of the PDF to read.
    pdf_file: FileOrName
    # Collected text lines; a fresh ContentList per instance.
    data: Any = field(default_factory=ContentList)

    def _extract_content(self):
        """Append the stripped text of each sufficiently long line to ``data``.

        Only page index 0 is requested from pdfminer.  Within every text box,
        entries whose ``len()`` is 2 or less are skipped.  A notice is
        printed when the page produced no text at all.
        """
        for page_layout in extract_pages(pdf_file=self.pdf_file, page_numbers=[0]):
            for element in page_layout:
                if not isinstance(element, LTTextBox):
                    continue
                for line in element:
                    if len(line) > 2:
                        self.data.append(line.get_text().strip())
            if not self.data:
                print(f'{self.pdf_file} 文件无法解析。')
            # Only the first page is ever processed.
            break

    def converter_dict(self):
        """Extract the PDF and return ``{str(pdf_file): extracted fields}``."""
        self._extract_content()
        fields = self.data.dump_dict()
        return {f'{self.pdf_file}': fields}
def load():
    """Return the parsed content of ``Result/files.json`` on the desktop.

    When the file does not exist yet it is first generated through
    ``FileDictLists().dump_json()``.
    """
    listing = desktop / 'Result/files.json'
    if not listing.exists():
        FileDictLists().dump_json()
    with listing.open('rb') as stream:
        return json.load(stream)
@auto_excel
def dump2json(obj: dict):
    """Extract every listed PDF and write the results to ``Result/data.json``.

    Args:
        obj: Mapping whose keys are PDF file names; the values are replaced
            by the extracted invoice fields in the output.  The mapping
            passed in is left untouched.

    Returns:
        Path: location of the written ``data.json`` on the desktop.
    """
    # Work on a copy: the input must not be modified while it is being
    # iterated, and the caller's dict should not change as a side effect.
    retval = dict(obj)
    for file in obj:
        retval.update(EInvoice(file).converter_dict())

    _json = desktop / 'Result/data.json'
    # The function may be called before anything else created the folder.
    _json.parent.mkdir(parents=True, exist_ok=True)
    with open(_json, 'w', encoding='utf-8') as f:
        json.dump(retval, f, indent=4, ensure_ascii=False, separators=(',', ':'))
    return _json
if __name__ == '__main__':
    # Script entry point: read (or build) the file listing, then extract
    # every listed invoice and export the results.
    pending = load()
    dump2json(pending)