LLM 应用开发实战：构建智能搜索与推荐引擎

	电影 1	电影 2	电影 3	电影 4
用户 1	4	-	5	-
用户 2	-	3	-	2
用户 3	5	4	-	3

LLM 应用开发实战：构建智能搜索与推荐引擎 | 极客日志

import numpy as np
# User-by-movie rating matrix (rows: users, columns: movies); replace with real data.
# Entries shown as "-" in the table above are encoded as 0 here.
user_movie_matrix = np.array(
    [
        [4, 0, 5, 0],
        [0, 3, 0, 2],
        [5, 4, 0, 3],
    ]
)

# Thin SVD of the rating matrix.
U, s, V = np.linalg.svd(user_movie_matrix, full_matrices=False)

# Number of latent factors to keep (tunable).
num_latent_factors = 2

# Rebuild the matrix from the leading `num_latent_factors` singular triplets.
top_user_factors = U[:, :num_latent_factors]
top_singular_values = np.diag(s[:num_latent_factors])
top_movie_factors = V[:num_latent_factors, :]
reconstructed_matrix = top_user_factors @ top_singular_values @ top_movie_factors

# Clamp negative entries to 0.
reconstructed_matrix = np.clip(reconstructed_matrix, 0, None)

print("重构矩阵:")
print(reconstructed_matrix)

重构矩阵:
[[4.2972542  0.         4.71897811 0.        ]
 [1.08572801 2.27604748 0.         1.64449028]
 [4.44777253 4.36821972 0.52207171 3.18082082]]

**注意**  
一个推荐系统 LLM 的示例是 P5，由 Shijie Geng 等人在论文《推荐作为语言处理（RLP）：统一的预训练、个性化提示与预测范式》（Recommendation as Language Processing (RLP): A Unified Pretrain, Personalized Prompt & Predict Paradigm (P5)）中提出。  
P5 是一个使用大语言模型（LLM）构建推荐系统的统一文本到文本范式。它包含三个步骤：

1. **预训练**：基于 T5 架构的基础语言模型在大规模网络语料库上进行预训练，并在推荐任务上进行微调。
2. **个性化提示**：根据用户的行为数据和上下文特征，为每个用户生成个性化提示。
3. **预测**：将个性化提示输入到预训练的语言模型中生成推荐。

P5 基于这样的理念，即 LLM 可以编码广泛的世界知识和用户偏好，并可以通过零样本或少量样本适应不同的推荐任务。

import pandas as pd
import ast
def _genre_names(raw):
    """Parse a string holding a list of genre dicts and return their 'name' values."""
    return [entry['name'] for entry in ast.literal_eval(raw)]


# Replace each stringified genre list with a plain list of genre names.
md['genres'] = md['genres'].apply(_genre_names)

# Weighted rating (IMDb-style formula)
def calculate_weighted_rate(vote_average, vote_count, min_vote_count=10, mean_vote=5.0):
    """Return a vote-count-weighted rating that is pulled towards ``mean_vote``.

    Computes ``v / (v + m) * R + m / (v + m) * C`` with ``R = vote_average``,
    ``v = vote_count``, ``m = min_vote_count`` and ``C = mean_vote``.

    Args:
        vote_average: Average rating of the item (R).
        vote_count: Number of votes the item received (v).
        min_vote_count: Vote count at which the item's own average and
            ``mean_vote`` carry equal weight (m).
        mean_vote: Prior rating the result is shrunk towards (C). Defaults to
            5.0, the value that used to be hard-coded.

    Returns:
        The weighted rating. When ``vote_count + min_vote_count`` is zero for
        plain Python numbers there is nothing to weight and ``mean_vote`` is
        returned instead of raising ``ZeroDivisionError``.
    """
    total_votes = vote_count + min_vote_count
    try:
        own_weight = vote_count / total_votes
        prior_weight = min_vote_count / total_votes
    except ZeroDivisionError:
        # No votes and no threshold: fall back to the prior.
        return mean_vote
    return own_weight * vote_average + prior_weight * mean_vote

# Vote-count threshold meant to keep the results from being skewed:
# the 95th percentile of the non-null vote counts.
vote_counts = md['vote_count'].dropna().astype('int')
min_vote_count = vote_counts.quantile(0.95)


def _weighted_rate_for(row):
    """Apply calculate_weighted_rate to one row using the shared threshold."""
    return calculate_weighted_rate(row['vote_average'], row['vote_count'], min_vote_count)


# Store the per-row result in a new 'weighted_rate' column.
md['weighted_rate'] = md.apply(_weighted_rate_for, axis=1)

def _combine_movie_info(row):
    """Render one row's title, overview, genres and weighted rating as a single string."""
    genre_list = ', '.join(row['genres'])
    return (
        f"Title: {row['title']}. "
        f"Overview: {row['overview']} "
        f"Genres: {genre_list}. "
        f"Rating: {row['weighted_rate']}"
    )


# One descriptive text per movie, stored as str.
md_final['combined_info'] = md_final.apply(_combine_movie_info, axis=1).astype(str)

import pandas as pd
import tiktoken
import os
import openai

# Read the API key from the environment (raises KeyError when it is not set).
openai.api_key = os.environ["OPENAI_API_KEY"]
from openai.embeddings_utils import get_embedding

# Model name handed to get_embedding further down.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)

# Drop texts that are too long.
md_final["n_tokens"] = md_final.combined_info.apply(lambda x: len(encoding.encode(x)))
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame.
md_final = md_final[md_final.n_tokens <= max_tokens].copy()

# NOTE(review): the token filter above is computed on `combined_info`, while the
# text sent for embedding is `overview` — confirm which column is intended.
md_final["embedding"] = md_final.overview.apply(lambda x: get_embedding(x, engine=embedding_model))

# Display the 'text' value stored under label 0.
# NOTE(review): this reads `md`, not `md_final`, and the 'text' column name only
# appears after 'combined_info' is renamed further down — presumably this runs on
# the frame reloaded from movies.pkl; confirm.
md['text'][0]

'Title: GoldenEye. Overview: James Bond must unmask the mysterious head of the Janus Syndicate and prevent the leader from utilizing the GoldenEye weapons system to inflict devastating revenge on Britain. Genres: Adventure, Action, Thriller. Rating: 6.173464373464373'

# Rename both columns in a single call, then persist the frame to disk.
md_final.rename(
    columns={'embedding': 'vector', 'combined_info': 'text'},
    inplace=True,
)
md_final.to_pickle('movies.pkl')

pip install lancedb

import lancedb

# Location handed to lancedb.connect.
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# Create the "movies" table from the `md` DataFrame.
# NOTE(review): the earlier snippets build `md_final`; presumably `md` is that
# frame reloaded from movies.pkl — confirm.
table = db.create_table("movies", md)

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import LanceDB
import os

# Bare lookup: the value is discarded, so this line only raises KeyError when
# OPENAI_API_KEY is not set.
os.environ["OPENAI_API_KEY"]
embeddings = OpenAIEmbeddings()
# Wrap the existing table as a LangChain vector store.
docsearch = LanceDB(connection=table, embedding=embeddings)

query = "I'm looking for an animated action movie. What could you suggest to me?"
# Run a similarity search for the query and display the returned documents.
docs = docsearch.similarity_search(query)
docs

[Document(page_content='Title: Hitman: Agent 47. Overview: An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry. Genres: Action, Crime, Thriller. Rating: 5.365800865800866', metadata={'genres': array(['Action', 'Crime', 'Thriller'], dtype=object), 'title': 'Hitman: Agent 47', 'overview': 'An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry.', 'weighted_rate': 5.365800865800866, 'n_tokens': 52, 'vector': array([-0.00566491, -0.01658553, ……])

# RetrievalQA and OpenAI are used from this point on but are not imported
# anywhere else in the visible source.
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Question-answering chain on top of the vector store's retriever; the source
# documents are returned together with the answer.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)
query = "I'm looking for an animated action movie. What could you suggest to me?"
result = qa({"query": query})
# Display the generated answer.
result['result']

' I would suggest Transformers. It is an animated action movie with genres of Adventure, Science Fiction, and Action, and a rating of 6.'

# Display the first entry of the 'source_documents' list in the chain result.
result['source_documents'][0]

Document(page_content='Title: Hitman: Agent 47. Overview: An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry. Genres: Action, Crime, Thriller. Rating: 5.365800865800866', metadata={'genres': array(['Action', 'Crime', 'Thriller'], dtype=object), 'title': 'Hitman: Agent 47', 'overview': 'An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry.', 'weighted_rate': 5.365800865800866, 'n_tokens': 52, 'vector': array([-0.00566491, -0.01658553, -0.02255735, ..., -0.01242317, -0.01303058, -0.00709073], dtype=float32), '_distance': 0.42414575815200806})

# Keep only the rows whose genre list contains 'Comedy'.
is_comedy = md['genres'].apply(lambda genres: 'Comedy' in genres)
df_filtered = md[is_comedy]

# Hand the filtered frame to the retriever through search_kwargs.
# NOTE(review): whether the LanceDB retriever honours a 'data' search kwarg is
# not visible here — confirm against the vector store implementation.
comedy_retriever = docsearch.as_retriever(search_kwargs={'data': df_filtered})
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=comedy_retriever,
    return_source_documents=True,
)
query = "I'm looking for a movie with animals and an adventurous plot."
result = qa({"query": query})

# QA chain whose retriever receives a 'filter' entry in search_kwargs.
# NOTE(review): 'weighted_rate__gt': 7 presumably means "weighted_rate greater
# than 7"; whether the LanceDB vector store understands this filter syntax is
# not visible here — confirm.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'filter': {'weighted_rate__gt': 7}}), return_source_documents=True)

from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.chat_models import ChatOpenAI

# Chat model with temperature set to 0.
llm = ChatOpenAI(temperature=0)
# NOTE(review): return_source_documents is passed to as_retriever here, while the
# other snippets pass it to RetrievalQA.from_chain_type — confirm the retriever
# accepts this argument.
retriever = docsearch.as_retriever(return_source_documents=True)
# Expose the retriever to the agent as a tool named "movies".
tool = create_retriever_tool(
    retriever,
    "movies",
    "Searches and returns recommendations about movies."
)
tools = [tool]
# Build the conversational agent with verbose output enabled and ask it a question.
agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True)
result = agent_executor({"input": "suggest me some action movies"})

> Entering new AgentExecutor chain...
Invoking: `movies` with `{'genre': 'action'}`
[Document(page_content='The action continues from [REC], ……]
Here are some action movies that you might enjoy:
1. [REC]² - The action continues from [REC], with a medical officer and a SWAT team sent into a sealed-off apartment to control the situation. It is a thriller/horror movie.
2. The Boondock Saints - Twin brothers Conner and Murphy take swift retribution into their hands to rid Boston of criminals. It is an action/thriller/crime movie.
3. The Gamers - Four clueless players are sent on a quest to rescue a princess and must navigate dangerous forests, ancient ruins, and more. It is an action/comedy/thriller/foreign movie.
4. Atlas Shrugged Part III: Who is John Galt? - In a collapsing economy, one man has the answer while others try to control or save him. It is a drama/science fiction/mystery movie.
Please note that these recommendations are based on the genre "action" and may vary in terms of availability and personal preferences.
> Finished chain.

# Print the prompt template text held by the chain's combine-documents step.
print(qa.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:

from langchain.prompts import PromptTemplate

# Custom prompt text: recommender role, instruction to suggest three movies,
# then the {context} and {question} placeholders.
template = """You are a movie recommender system that help users to find movies that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three movies, with a short description of the plot and the reason why the user might like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Your response:"""
 
# Wrap the template text in a PromptTemplate with "context" and "question"
# declared as its input variables.
PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

# Prompt object built from the custom template text.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
# Passed to the chain so it uses PROMPT as its prompt.
chain_type_kwargs = dict(prompt=PROMPT)
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)
query = "I'm looking for a funny action movie, any suggestion?"
result = qa({'query': query})
print(result['result'])

1. A Good Day to Die Hard: An action-packed comedy directed by John Moore, this movie follows Iconoclastic, take-no-prisoners cop John McClane as he travels to Moscow to help his wayward son Jack. With the Russian underworld in pursuit, and battling a countdown to war, the two McClanes discover that their opposing methods make them unstoppable heroes.
2. The Hidden: An alien is on the run in America and uses the bodies of anyone in its way as a hiding place. With lots of innocent people dying in the chase, this action-packed horror movie is sure to keep you laughing.
3. District B13: Set in the ghettos of Paris in 2010, this action-packed science fiction movie follows an undercover cop and ex-thug as they try to infiltrate a gang in order to defuse a neutron bomb. A thrilling comedy that will keep you laughing.

from langchain.prompts import PromptTemplate

# Fixed instructions placed ahead of the retrieved context.
template_prefix = """You are a movie recommender system that help users to find movies that match their preferences.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}"""

# User-profile section; its placeholders are filled in right below.
user_info_template = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""
user_info = user_info_template.format(age=18, gender='female')

# Closing section carrying the user's question.
template_suffix = """Question: {question}
Your response:"""

# Join the three sections with newlines; {context} and {question} remain as
# placeholders in the combined text.
COMBINED_PROMPT = '\n'.join([template_prefix, user_info, template_suffix])
print(COMBINED_PROMPT)

You are a movie recommender system that help users to find movies that match their preferences.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 18
Gender: female
Question: {question}
Your response:

# Prompt object built from the combined (prefix + user info + suffix) text.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=COMBINED_PROMPT,
)
chain_type_kwargs = dict(prompt=PROMPT)
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)
# Reuses the `query` variable defined earlier.
result = qa({'query': query})
result['result']

' Sure, I can suggest some action movies for you. Here are a few examples: A Good Day to Die Hard, Goldfinger, Ong Bak 2, and The Raid 2. All of these movies have high ratings and feature thrilling action elements. I hope you find something that you enjoy!'

import pandas as pd
# Sample user profiles; each "movies" entry starts out as a list of
# (title, rating) pairs.
data = {
    "username": ["Alice", "Bob"],
    "age": [25, 32],
    "gender": ["F", "M"],
    "movies": [
        [("Transformers: The Last Knight", 7), ("Pokémon: Spell of the Unknown", 5)],
        [("Bon Cop Bad Cop 2", 8), ("Goon: Last of the Enforcers", 9)],
    ],
}
# Turn every user's (title, rating) pairs into a {title: rating} dict, in place.
data["movies"][:] = [dict(pairs) for pairs in data["movies"]]
# Build the pandas DataFrame.
df = pd.DataFrame(data)
df.head()

# Fixed instructions placed ahead of the retrieved context.
template_prefix = """You are a movie recommender system that help users to find movies that match their preferences.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}"""
# User-profile section, including the movies the user has already rated.
user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}
Movies already seen alongside with rating: {movies}"""
# Closing section carrying the user's question.
template_suffix= """Question: {question}
Your response:"""

# Select the user's row once and read every field from it. The positional
# .iloc[0] takes the first matching row whatever its index label is, so the
# lookup works for any username and age, gender and movies always belong to the
# same user.
username = 'Alice'
user_row = df.loc[df['username'] == username].iloc[0]
age = user_row['age']
gender = user_row['gender']
# One newline-terminated "Movie: ..., Rating: ..." line per rated movie.
movies = ''.join(
    f"Movie: {movie}, Rating: {rating}" + "\n"
    for movie, rating in user_row['movies'].items()
)
user_info = user_info.format(age=age, gender=gender, movies=movies)
COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a movie recommender system that help users to find movies that match their preferences.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 25
Gender: F
Movies already seen alongside with rating: Movie: Transformers: The Last Knight, Rating: 7
Movie: Pokémon: Spell of the Unknown, Rating: 5
Question: {question}
Your response:

# Prompt object built from the combined text that includes the user's profile.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=COMBINED_PROMPT,
)
chain_type_kwargs = dict(prompt=PROMPT)
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)
query = "Can you suggest me some action movie based on my background?"
result = qa({'query': query})
result['result']

" Based on your age, gender, and the movies you've already seen, I would suggest the following action movies: The Raid 2 (Action, Crime, Thriller; Rating: 6.71), Ong Bak 2 (Adventure, Action, Thriller; Rating: 5.24), Hitman: Agent 47 (Action, Crime, Thriller; Rating: 5.37), and Kingsman: The Secret Service (Crime, Comedy, Action, Adventure; Rating: 7.43)."

import streamlit as st
st.set_page_config(page_title="MovieHarbor", page_icon="")
st.header(' Welcome to MovieHarbor, your favourite movie recommender')

# NOTE(review): besides streamlit, this script uses os, load_dotenv, pd, lancedb,
# OpenAIEmbeddings, LanceDB, RetrievalQA, OpenAI and PromptTemplate; none of them
# is imported inside this snippet — confirm the app file imports them.

# load_dotenv presumably loads variables from a .env file (confirm); the lookup
# on the next line raises KeyError when OPENAI_API_KEY is not set.
load_dotenv()
openai_api_key = os.environ['OPENAI_API_KEY']
embeddings = OpenAIEmbeddings()
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# Open the existing 'movies' table and wrap it as a LangChain vector store.
table = db.open_table('movies')
docsearch = LanceDB(connection=table, embedding=embeddings)

# Load the movie dataset.
md = pd.read_pickle('movies.pkl')

# Sidebar for the user's input.
st.sidebar.title("Movie Recommendation System")
st.sidebar.markdown("Please enter your details and preferences below:")

# Ask for age, gender and favourite genre; the genre options are the distinct
# values found in the dataset's 'genres' lists.
age = st.sidebar.slider("What is your age?", 1, 100, 25)
gender = st.sidebar.radio("What is your gender?", ("Male", "Female", "Other"))
genre = st.sidebar.selectbox("What is your favourite movie genre?", md.explode('genres')["genres"].unique())

# Keep only the movies whose genre list contains the selected genre.
df_filtered = md[md['genres'].apply(lambda x: genre in x)]

template_prefix = """You are a movie recommender system that helps users to find movies that match their preferences.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix = """Question: {question}
Your response:"""

# Fill in the user's details; {context} and {question} stay as placeholders.
user_info = user_info.format(age=age, gender=gender)
COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

# Set up the chain. The personalised prompt is passed through chain_type_kwargs
# so that the age and gender collected in the sidebar actually reach the model.
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}
# NOTE(review): whether the LanceDB retriever honours a 'data' search kwarg is
# not visible here — confirm the genre filter is really applied.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'data': df_filtered}), return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

# Run the chain only once the user has typed a question.
query = st.text_input('Enter your question:', placeholder='What action movies do you suggest?')
if query:
    result = qa({"query": query})
    st.write(result['result'])

LLM 应用开发实战：构建智能搜索与推荐引擎

技术要求

推荐系统简介

现有的推荐系统

K 近邻算法

更多推荐文章

相关免费在线工具

矩阵分解

神经网络

LLMs 如何改变推荐系统

实现一个基于 LLM 的推荐系统

数据预处理

在冷启动场景中构建一个 QA 推荐聊天机器人

构建基于内容的系统

使用 Streamlit 开发前端

总结

更多推荐文章

相关免费在线工具

LLM 应用开发实战：构建智能搜索与推荐引擎

技术要求

推荐系统简介

现有的推荐系统

K 近邻算法

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

矩阵分解

神经网络

LLMs 如何改变推荐系统

实现一个基于 LLM 的推荐系统

数据预处理

在冷启动场景中构建一个 QA 推荐聊天机器人

构建基于内容的系统

使用 Streamlit 开发前端

总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具