import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
# Load the IMDB dataset; fall back to a tiny in-memory sample so the
# rest of the script still runs when the CSV is not present.
try:
    df = pd.read_csv('IMDB Dataset.csv')
except FileNotFoundError:
    # Mock data for demonstration purposes.
    df = pd.DataFrame({'review': ['I love this movie', 'This is terrible'],
                       'sentiment': ['positive', 'negative']})
def clean_text(text):
    """Normalize a review string for tokenization.

    Strips HTML tags, drops every character that is not an ASCII letter
    or whitespace, and lowercases the result. Non-string input is
    coerced with ``str()`` first.
    """
    text = re.sub(r'<.*?>', '', str(text))        # remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)       # keep letters/whitespace only
    return text.lower()
# Clean the raw reviews and encode sentiment as a binary label
# (positive -> 1, negative -> 0).
df['clean_review'] = df['review'].apply(clean_text)
label_map = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(label_map)
# Hold out 20% of the data as a test split; fixed seed for reproducibility.
features = df['clean_review']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
# --- 3.2 Vocabulary and encoding ---
# Build a simple bag-of-words index.
from collections import Counter
# Count word frequencies over the training split only (no test leakage).
vocab = Counter()
for text in X_train:
    vocab.update(text.split())

# Keep at most the 5000 most frequent words. Index 0 is reserved for
# padding, so real words start at 1; <UNK> takes the next free index.
vocab_size = min(len(vocab), 5000)
vocab_list = [w for w, _ in vocab.most_common(vocab_size)]
word2idx = {w: i + 1 for i, w in enumerate(vocab_list)}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = len(word2idx)
def text_to_sequence(text, max_len=100):
    """Encode a whitespace-tokenized string as a fixed-length index array.

    Words missing from the module-level ``word2idx`` map to the <UNK>
    index. Shorter sequences are right-padded with 0 (the <PAD> index);
    longer ones are truncated to ``max_len``.
    """
    seq = [word2idx.get(w, word2idx['<UNK>']) for w in text.split()]
    if len(seq) < max_len:
        seq += [0] * (max_len - len(seq))
    else:
        seq = seq[:max_len]
    return np.array(seq)
# Encode every train/test review as a fixed-length (max_len=100) index row.
X_train_seq = np.array([text_to_sequence(t) for t in X_train])
X_test_seq = np.array([text_to_sequence(t) for t in X_test])