BLEU 计算和解析 - CFANZ编程社区

https://zhuanlan.zhihu.com/p/381123453

按自己的理解实现的代码

# -*- coding:utf-8 -*-
import numpy as np

def char_zi_str(resf,n):
    """Collect every contiguous slice of ``resf`` of length 1 through ``n``.

    :param resf: a sliceable sequence with a length (e.g. a string).
    :param n: largest slice length to extract.
    :return: a list with ``n`` entries; entry ``k - 1`` holds all slices of
        length ``k`` in left-to-right order (empty when ``resf`` is shorter
        than ``k``).
    """
    return [
        [resf[start:start + width] for start in range(len(resf) + 1 - width)]
        for width in range(1, n + 1)
    ]
def Bleu(pre, resf, n):
    """Compute an unsmoothed, single-reference BLEU score over characters.

    For every order ``k`` in ``1..n`` the clipped ("modified") n-gram
    precision of ``pre`` against ``resf`` is computed: each candidate n-gram
    is credited at most as many times as it occurs in the reference, and the
    total is divided by the number of candidate n-grams. The score is the
    brevity penalty multiplied by the geometric mean of those precisions.

    :param pre: candidate (predicted) string.
    :param resf: reference string.
    :param n: highest n-gram order; orders ``1..n`` are weighted uniformly.
    :return: BLEU score as a float in ``[0, 1]``. ``0.0`` is returned when
        ``n < 1``, when either string is empty, or when any order has no
        matching n-gram (including a candidate shorter than that order).
    """
    # Local import: only numpy is imported at module level above this block.
    from collections import Counter

    if n < 1 or not pre or not resf:
        return 0.0

    log_precisions = []
    for k in range(1, n + 1):
        pre_counts = Counter(pre[i:i + k] for i in range(len(pre) - k + 1))
        resf_counts = Counter(resf[i:i + k] for i in range(len(resf) - k + 1))
        # Clip by the reference count so that repeating an n-gram cannot earn
        # more credit than the reference actually contains.
        matched = sum(min(count, resf_counts[gram])
                      for gram, count in pre_counts.items())
        if matched == 0:
            # A zero precision makes the geometric mean zero; log(0) is
            # undefined, so short-circuit instead.
            return 0.0
        log_precisions.append(np.log(matched / sum(pre_counts.values())))

    # log of the brevity penalty: 0 when the candidate is at least as long as
    # the reference, otherwise 1 - r/c (negative).
    log_bp = min(0.0, 1.0 - len(resf) / len(pre))
    # Average over all orders; every order contributes, not just the first.
    return float(np.exp(log_bp + sum(log_precisions) / n))

if __name__ == "__main__":
    # Demo: score a short candidate against a reference line, orders 1-4.
    candidate_text = "小女不知不知"
    reference_text = "儿女不知来避地，强言风物胜江南。"
    bleu = Bleu(candidate_text, reference_text, 4)
    print(bleu)

另一个版本

# -*- coding:utf-8 -*-
import numpy as np
from  collections import  Counter
import pandas as pd
def char_zi_str(resf,n):
    """Group the contiguous slices of ``resf`` by length, for lengths 1..``n``.

    :param resf: a sliceable sequence with a length (e.g. a string).
    :param n: largest slice length to extract.
    :return: list of ``n`` lists; the list at index ``size - 1`` contains the
        slices of length ``size`` in order of their start position.
    """
    total = len(resf)
    grams_by_size = []
    for size in range(1, n + 1):
        # Number of start positions that leave room for a full slice;
        # non-positive values simply yield an empty list.
        window_count = total + 1 - size
        grams_by_size.append([resf[pos:pos + size] for pos in range(window_count)])
    return grams_by_size

def Belu(pre, resf, n):
    """Compute an unsmoothed, single-reference BLEU score over characters.

    For each order ``1..n`` the clipped n-gram precision is
    ``sum(min(candidate_count, reference_count)) / total candidate n-grams``.
    Unmatched candidate n-grams stay in the denominator, and a reference
    count can never contribute more than the candidate count. The result is
    the brevity penalty times the geometric mean of the per-order precisions.

    :param pre: candidate (predicted) string.
    :param resf: reference string.
    :param n: highest n-gram order; orders ``1..n`` are weighted uniformly.
    :return: BLEU score as a float in ``[0, 1]``. ``0.0`` is returned when
        ``n < 1``, when either string is empty, or when any order has a
        precision of zero.
    """
    if n < 1 or not pre or not resf:
        return 0.0

    precisions = []
    for order in range(1, n + 1):
        pre_counter = Counter(
            pre[i:i + order] for i in range(len(pre) - order + 1))
        resf_counter = Counter(
            resf[i:i + order] for i in range(len(resf) - order + 1))
        total = sum(pre_counter.values())
        # Counter intersection keeps, per n-gram, the smaller of the two
        # counts, which is exactly the clipped match count.
        matched = sum((pre_counter & resf_counter).values())
        precisions.append(matched / total if total else 0.0)

    if min(precisions) == 0:
        # The geometric mean is zero as soon as one order has no match.
        return 0.0

    # log of the brevity penalty: 0 for candidates at least as long as the
    # reference, otherwise 1 - r/c (negative).
    log_bp = min(0.0, 1.0 - len(resf) / len(pre))
    return float(np.exp(log_bp + np.mean(np.log(precisions))))

if __name__ == "__main__":
    # Demo: the candidate differs from the reference only in the comma
    # character (ASCII "," versus full-width "，"); orders 1-4 are scored.
    candidate_text = "儿女不知来避地,强言风物胜江南。"
    reference_text = "儿女不知来避地，强言风物胜江南。"
    bleu = Belu(candidate_text, reference_text, 4)
    print(bleu)

网友代码
https://zhuanlan.zhihu.com/p/223048748

def bleu(references,candidate, n):
    """Return the modified ``n``-gram precision of ``candidate``.

    Thin wrapper: the arguments are forwarded unchanged to
    ``modified_precision`` and its result is returned as-is. No brevity
    penalty and no combination across n-gram orders is applied here.

    :param references: reference translations, passed through.
    :param candidate: candidate translation, passed through.
    :param n: the n-gram order, passed through.
    :return: whatever ``modified_precision`` returns for these arguments.
    """
    precision = modified_precision(references, candidate, n)
    return precision
def modified_precision(references, candidate, n):
    """
    Compute BLEU's modified (clipped) n-gram precision.

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching candidate word is identified, e.g.

        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
        ...               'ensures', 'that', 'the', 'military', 'will',
        ...               'forever', 'heed', 'Party', 'commands']
        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
        ...               'guarantees', 'the', 'military', 'forces', 'always',
        ...               'being', 'under', 'the', 'command', 'of', 'the',
        ...               'Party']
        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
        ...               'of', 'the', 'party']
        >>> candidate= 'of the'.split()
        >>> references = [reference1, reference2, reference3]
        >>> float(modified_precision(references, candidate, n=1))
        1.0
        >>> float(modified_precision(references, candidate, n=2))
        1.0

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param candidate: A candidate translation.
    :type candidate: list(str)
    :param n: The ngram order (must be at least 1).
    :type n: int
    :return: BLEU's modified precision for the nth order ngram; 0.0 when the
        candidate has no n-grams or ``references`` is empty.
    :rtype: float
    :raises ValueError: if ``n`` is smaller than 1.
    """
    # Local import so the function does not depend on a module-level import.
    from collections import Counter

    if n < 1:
        raise ValueError("n must be a positive integer")

    def _ngram_counts(tokens):
        # Count every window of n consecutive tokens; a sequence shorter
        # than n produces an empty Counter.
        tokens = list(tokens)
        return Counter(
            tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )

    # All n-grams of the candidate with their occurrence counts.
    counts = _ngram_counts(candidate)

    # For each candidate n-gram, the highest count seen in any one reference.
    # A Counter yields 0 for n-grams never seen, so an empty ``references``
    # list gives a precision of 0 instead of a KeyError.
    max_counts = Counter()
    for reference in references:
        reference_counts = _ngram_counts(reference)
        for ngram in counts:
            max_counts[ngram] = max(max_counts[ngram], reference_counts[ngram])

    # Clip each candidate count by that per-reference maximum.
    numerator = sum(
        min(count, max_counts[ngram]) for ngram, count in counts.items()
    )
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # This happens when the candidate is shorter than the ngram order.
    denominator = max(1, sum(counts.values()))

    return numerator / denominator