# 中文句子:我喜欢看电影和读书。
# 分词结果:我 | 喜 | 欢 | 看 | 电 | 影 | 和 | 读 | 书 | 。
# 英文句子:I enjoy watching movies and reading books.
# 分词结果:I | | e | n | j | o | y | | w | a | t | c | h | i | n | g | | m | o | v | i | e | s | | a | n | d | | r | e | a | d | i | n | g | | b | o | o | k | s | .
from collections import defaultdict
# Toy corpus: single Chinese words followed by short English sentences.
# Each entry is later split on whitespace to obtain the tokens to count.
sentences = [
    "我", "喜欢", "吃", "苹果",
    "他", "不", "喜欢", "吃", "苹果派",
    "I like to eat apples",
    "She has a cute cat",
    "you are very cute",
    "give you a hug",
]
# Build the token frequency statistics.
def build_stats(sentences):
    """Count how often each whitespace-separated token occurs.

    Args:
        sentences: Iterable of strings. Each string is split with
            ``str.split()``, so an entry without whitespace counts as a
            single token.

    Returns:
        A ``defaultdict(int)`` mapping each token to its occurrence count.
    """
    stats = defaultdict(int)
    for sentence in sentences:
        for symbol in sentence.split():
            stats[symbol] += 1
    return stats
# Count token frequencies over the corpus.
stats = build_stats(sentences)
print("stats:", stats)

# Collect the WordPiece alphabet: the first character of every word as-is,
# and every later character with the "##" continuation prefix.
# A set is used so that duplicates are dropped without scanning a list.
alphabet = set()
for word in stats:
    alphabet.add(word[0])
    alphabet.update(f"##{letter}" for letter in word[1:])
alphabet = sorted(alphabet)

# Initial vocabulary
vocab = alphabet.copy()
print("alphabet:", alphabet)
# 根据初始词表拆分每个词:
# Split every word into its initial WordPiece symbols: the first character
# unchanged, each following character prefixed with "##".
splits = {
    word: [char if pos == 0 else f"##{char}" for pos, char in enumerate(word)]
    for word in stats
}
print("splits:", splits)
# 根据上述提到的互信息分数公式计算每个子词对的得分:
# score(a, b) = freq(a, b) / (freq(a) * freq(b))
def compute_pair_scores(splits, word_freqs=None):
    """Score every adjacent symbol pair found in the current word splits.

    The score of a pair ``(a, b)`` is
    ``pair_freq(a, b) / (symbol_freq(a) * symbol_freq(b))``, where each
    frequency is weighted by the frequency of the word it occurs in.

    Args:
        splits: Mapping of word -> list of symbols the word is currently
            split into. Must contain every key of ``word_freqs``.
        word_freqs: Mapping of word -> frequency. When omitted, the
            module-level ``stats`` mapping is used.

    Returns:
        A dict mapping ``(left_symbol, right_symbol)`` tuples to their score.
        Words made of fewer than two symbols contribute no pairs.
    """
    if word_freqs is None:
        word_freqs = stats

    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        # Every symbol occurrence counts once per occurrence of the word.
        for symbol in split:
            letter_freqs[symbol] += freq
        # Adjacent pairs; yields nothing for splits shorter than two symbols.
        for pair in zip(split, split[1:]):
            pair_freqs[pair] += freq

    return {
        (left, right): freq / (letter_freqs[left] * letter_freqs[right])
        for (left, right), freq in pair_freqs.items()
    }
pair_scores = compute_pair_scores(splits)
# Print only the first six scored pairs as a quick preview.
for i, (pair, score) in enumerate(pair_scores.items()):
    print(f"{pair}: {score}")
    if i >= 5:
        break
def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of symbols ``a`` and ``b`` in ``splits``.

    The merged symbol is ``a`` followed by ``b`` with a leading ``"##"``
    removed from ``b`` when present.

    Args:
        a: Left symbol of the pair to merge.
        b: Right symbol of the pair to merge.
        splits: Mapping of word -> list of symbols. Every word in it is
            processed, and its entry is replaced by the merged symbol list.

    Returns:
        The same ``splits`` mapping, updated in place.
    """
    merged = a + b[2:] if b.startswith("##") else a + b
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for word in splits:
        split = splits[word]
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                # Do not advance: the merged symbol is compared against its
                # new right-hand neighbour on the next iteration.
                split = split[:i] + [merged] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits
# Toy corpus: single Chinese words followed by short English sentences.
# Each entry is later split on whitespace to obtain the tokens to count.
sentences = [
    "我", "喜欢", "吃", "苹果",
    "他", "不", "喜欢", "吃", "苹果派",
    "I like to eat apples",
    "She has a cute cat",
    "you are very cute",
    "give you a hug",
]
# Build the token frequency statistics.
def build_stats(sentences):
    """Count how often each whitespace-separated token occurs.

    Args:
        sentences: Iterable of strings. Each string is split with
            ``str.split()``, so an entry without whitespace counts as a
            single token.

    Returns:
        A ``defaultdict(int)`` mapping each token to its occurrence count.
    """
    stats = defaultdict(int)
    for sentence in sentences:
        for symbol in sentence.split():
            stats[symbol] += 1
    return stats
# Count token frequencies over the corpus.
stats = build_stats(sentences)
print("stats:", stats)

# The base alphabet is every distinct character seen in any word, sorted.
alphabet = sorted({letter for word in stats for letter in word})

# Initial vocabulary
vocab = alphabet.copy()
print("alphabet:", alphabet)
# 根据初始词表拆分每个词,计算左右 pair(子词对)出现的频率:
# Start with every word split into its individual characters.
splits = {}
for word in stats.keys():
    splits[word] = list(word)
print("splits:", splits)
def compute_pair_freqs(splits, word_freqs=None):
    """Count how often each adjacent symbol pair occurs in the word splits.

    Args:
        splits: Mapping of word -> list of symbols the word is currently
            split into. Must contain every key of ``word_freqs``.
        word_freqs: Mapping of word -> frequency. When omitted, the
            module-level ``stats`` mapping is used.

    Returns:
        A ``defaultdict(int)`` mapping ``(left_symbol, right_symbol)`` tuples
        to the summed frequency of the words in which the pair appears
        (counted once per occurrence inside a word).
    """
    if word_freqs is None:
        word_freqs = stats

    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        # Adjacent pairs; yields nothing for splits shorter than two symbols.
        for pair in zip(split, split[1:]):
            pair_freqs[pair] += freq
    return pair_freqs
pair_freqs = compute_pair_freqs(splits)
# Print only the first six pairs as a quick preview.
for i, (pair, freq) in enumerate(pair_freqs.items()):
    print(f"{pair}: {freq}")
    if i >= 5:
        break