# Reference: https://zhuanlan.zhihu.com/p/381123453
# My own implementation, written from my understanding of the article (自己理解的代码)
# -*- coding:utf-8 -*-
import numpy as np
def char_zi_str(resf, n):
    """Return every contiguous n-gram of ``resf`` for orders 1 through ``n``.

    :param resf: a sliceable sequence (typically a string, split per character).
    :param n: highest n-gram order to extract.
    :return: a list with ``n`` entries; entry ``k - 1`` is the list of all
        length-``k`` slices of ``resf`` in left-to-right order.  An order
        longer than ``resf`` yields an empty list, and ``n < 1`` yields ``[]``.
    """
    length = len(resf)
    return [
        # range() is empty when step > length, so no short slices are produced.
        [resf[start:start + step] for start in range(length - step + 1)]
        for step in range(1, n + 1)
    ]
def Bleu(pre, resf, n):
    """Score candidate ``pre`` against reference ``resf`` with BLEU up to order ``n``.

    For every order 1..n the clipped ("modified") n-gram precision is
    computed: each candidate n-gram is credited at most as many times as it
    occurs in the reference, and the total is divided by the number of
    candidate n-grams.  The score is::

        exp(min(0, 1 - len(resf) / len(pre)) + sum(log(p_k)) / n)

    Orders whose precision is 0 are left out of the sum, so one missing
    order does not force the whole score to 0; the score is 0 only when no
    order matches at all.

    :param pre: candidate sequence (e.g. a string scored character by character).
    :param resf: reference sequence of the same kind.
    :param n: highest n-gram order; every order is weighted ``1 / n``.
    :return: the score as a float; ``0.0`` when ``pre`` is empty, ``n < 1``
        or nothing matches.
    """
    # Imported locally so the function does not rely on import order in the file.
    from collections import Counter

    if n < 1 or len(pre) == 0:
        return 0.0

    def count_ngrams(seq, order):
        # Slide a window of width ``order``; list slices become tuples so they hash.
        windows = (seq[i:i + order] for i in range(len(seq) - order + 1))
        return Counter(w if isinstance(w, str) else tuple(w) for w in windows)

    precisions = []
    for order in range(1, n + 1):
        cand_counts = count_ngrams(pre, order)
        ref_counts = count_ngrams(resf, order)
        total = sum(cand_counts.values())
        # Counter & Counter keeps the minimum count per key: the clipped matches.
        matched = sum((cand_counts & ref_counts).values())
        precisions.append(matched / total if total else 0.0)

    nonzero = np.array([p for p in precisions if p > 0])
    if nonzero.size == 0:
        return 0.0

    # Log of the brevity penalty: 0 when the candidate is at least as long
    # as the reference, negative otherwise.
    log_bp = min(0.0, 1 - len(resf) / len(pre))
    return float(np.exp(log_bp + np.log(nonzero).sum() / n))
if __name__ == "__main__":
    # Demo: score a short candidate against one reference line, orders 1..4.
    candidate_text = "小女不知不知"
    reference_text = "儿女不知来避地,强言风物胜江南。"
    bleu = Bleu(candidate_text, reference_text, 4)
    print(bleu)
# Alternative version using Counter and pandas (other版)
# -*- coding:utf-8 -*-
import numpy as np
from collections import Counter
import pandas as pd
def char_zi_str(resf, n):
    """Collect the contiguous n-grams of ``resf`` for each order from 1 to ``n``.

    :param resf: a sliceable sequence (typically a string, split per character).
    :param n: highest n-gram order to extract.
    :return: a list of ``n`` lists; the list at index ``k - 1`` holds every
        length-``k`` slice of ``resf`` from left to right.  Orders longer
        than ``resf`` give an empty list; ``n < 1`` gives ``[]``.
    """
    size = len(resf)
    grams_per_order = []
    for width in range(1, n + 1):
        # size - width + 1 window positions; range() is empty if width > size.
        grams_per_order.append(
            [resf[pos:pos + width] for pos in range(size - width + 1)]
        )
    return grams_per_order
def Belu(pre, resf, n):
    """Compute a BLEU score of candidate ``pre`` against reference ``resf``.

    For each order 1..n the modified n-gram precision is the number of
    candidate n-grams that also occur in the reference, with every n-gram
    counted at most as often as it appears in the reference, divided by
    the total number of candidate n-grams.  Unmatched candidate n-grams stay
    in the denominator.  The score is::

        exp(min(0, 1 - len(resf) / len(pre)) + sum(log(p_k)) / n)

    Orders with precision 0 are skipped in the sum instead of zeroing the
    score; the result is 0 only when no order has any match.

    :param pre: candidate sequence (e.g. a string, compared per character).
    :param resf: reference sequence of the same kind.
    :param n: highest n-gram order; each order carries weight ``1 / n``.
    :return: the score as a float; ``0.0`` for an empty candidate, ``n < 1``
        or no overlap.
    """
    cand_len = len(pre)
    if n < 1 or cand_len == 0:
        return 0.0

    def ngram_counts(seq, width):
        # Tuples keep n-grams hashable when the input is a list of tokens.
        grams = (seq[i:i + width] for i in range(len(seq) - width + 1))
        return Counter(g if isinstance(g, str) else tuple(g) for g in grams)

    log_precisions = []
    for width in range(1, n + 1):
        pre_counter = ngram_counts(pre, width)
        resf_counter = ngram_counts(resf, width)
        denominator = sum(pre_counter.values())
        # min(candidate count, reference count) per n-gram, summed.
        clipped = sum(
            min(count, resf_counter[gram]) for gram, count in pre_counter.items()
        )
        if denominator and clipped:
            log_precisions.append(np.log(clipped / denominator))

    if not log_precisions:
        return 0.0

    # Brevity penalty in log space; only candidates shorter than the
    # reference are penalised.
    log_bp = min(0.0, 1 - len(resf) / cand_len)
    return float(np.exp(log_bp + sum(log_precisions) / n))
if __name__ == "__main__":
    # Demo: candidate identical to the reference, scored with orders 1..4.
    sample_line = "儿女不知来避地,强言风物胜江南。"
    bleu = Belu(sample_line, "儿女不知来避地,强言风物胜江南。", 4)
    print(bleu)
# Code shared by another user (网友代码)
# Source: https://zhuanlan.zhihu.com/p/223048748
def bleu(references, candidate, n):
    """Return the order-``n`` modified n-gram precision of ``candidate``.

    Thin wrapper: the three arguments are forwarded unchanged to
    ``modified_precision`` and its result is returned as is.  No brevity
    penalty and no averaging over several orders is applied here.
    """
    precision = modified_precision(references, candidate, n)
    return precision
def modified_precision(references, candidate, n):
    """
    Return BLEU's modified (clipped) n-gram precision of ``candidate``.

    In the modified n-gram precision, a reference word is considered
    exhausted after a matching candidate word is identified: every candidate
    n-gram is credited at most as many times as it occurs in the single
    reference where it is most frequent, e.g.

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> candidate = 'of the'.split()
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, candidate, n=1))
    1.0
    >>> float(modified_precision(references, candidate, n=2))
    1.0

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param candidate: The candidate (hypothesis) translation.
    :type candidate: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram; ``0.0`` when
        the candidate has no n-grams of that order or ``references`` is empty.
    :rtype: float
    """
    def count_ngrams(tokens):
        # zip over n shifted views yields each window of n consecutive items
        # as a tuple; sequences shorter than n (or n < 1) yield nothing.
        return Counter(zip(*(tokens[offset:] for offset in range(n))))

    # All n-grams of the candidate with their frequencies.
    counts = count_ngrams(candidate)

    # For each candidate n-gram, the highest count seen in any one reference.
    max_counts = {}
    for reference in references:
        reference_counts = count_ngrams(reference)
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # Clip each candidate count by its reference maximum.  .get() covers the
    # case of an empty ``references`` list, where max_counts has no entries.
    numerator = sum(
        min(count, max_counts.get(ngram, 0)) for ngram, count in counts.items()
    )
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(candidate).
    denominator = max(1, sum(counts.values()))
    return numerator / denominator