import os

# Use your own OpenAI API key. The key is read from the OPENAI_API_KEY
# environment variable so the real secret does not have to be written into
# this file; the placeholder is only a fallback and must be replaced if the
# variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY')
一、文本总结 (Summarization)
将一段文本扔给 LLM,让它生成总结可以说是最常见的场景之一了。
1. 短文本总结
# Summaries of short text
from langchain.llms import OpenAI
from langchain import PromptTemplate

# Initialise the LLM model.
llm = OpenAI(temperature=0, model_name='gpt-3.5-turbo', openai_api_key=openai_api_key)

# Create the template; {text} is the slot that prompt.format() fills in below.
template = """
%INSTRUCTIONS:
Please summarize the following piece of text.
Respond in a manner that a 5 year old would understand.
%TEXT:
{text}
"""

# Create a LangChain prompt template so that a value can be inserted later.
prompt = PromptTemplate(
    input_variables=["text"],
    template=template,
)

confusing_text = """
For the next 130 years, debate raged.
Some scientists called Prototaxites a lichen, others a fungus, and still others clung to the notion that it was some kind of tree.
'The problem is that when you look up close at the anatomy, it's evocative of a lot of different things, but it's diagnostic of nothing,' says Boyce, an associate professor in geophysical sciences and the Committee on Evolutionary Biology.
'And it's so damn big that when whenever someone says it's something, everyone else's hackles get up: 'How could you have a lichen 20 feet tall?''
"""

print("------- Prompt Begin -------")
# Print the rendered template content.
final_prompt = prompt.format(text=confusing_text)
print(final_prompt)
print("------- Prompt End -------")
# Use it. This will run through the 36 documents, summarize the chunks, then get a summary of the summary.
# A typical map-reduce approach: split the article into several parts, summarize each
# part separately, then merge the results by summarizing the summaries.
# NOTE(review): `chain` and `docs` are not defined anywhere in the visible source —
# confirm the summarization chain and the split documents are built before this point.
output = chain.run(docs)
print(output)
# Try yourself
from langchain import OpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
# Create the LLM client using the API key defined at the top of the file.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
loader = TextLoader('wonderland.txt') # Load a long text; the novel Alice in Wonderland is used as the input again
doc = loader.load()
# Report how many documents were loaded and the character count of the first one.
print(f"You have {len(doc)} document")
print(f"You have {len(doc[0].page_content)} characters in that document")
You have 1 document
You have 164014 characters in that document
# Split the loaded document into overlapping chunks before measuring them.
# `docs` was read here without this section ever creating it; the splitter
# settings are the ones used for the same text in the evaluation section of
# this file.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)
docs = text_splitter.split_documents(doc)

# Get the total number of characters so that the average can be computed.
num_total_characters = sum(len(x.page_content) for x in docs)
print(f"Now you have {len(docs)} documents that have an average of {num_total_characters / len(docs):,.0f} characters (smaller pieces)")
Now you have 62 documents that have an average of 2,846 characters (smaller pieces)
# Embed the chunks, index them in a FAISS vector store, and wrap the store's
# retriever in a RetrievalQA chain. `qa` was used below without being built.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
docsearch = FAISS.from_documents(docs, embeddings)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

query = "What does the author describe the Alice following with?"
qa.run(query)
# During this step the retriever fetches the similar parts of the document and
# combines them with your question so the LLM can reason and produce the answer.
# There is plenty to investigate further here, such as how to choose the best
# chunk size, the best embedding engine, the best retriever, and so on.
# A cloud-hosted vector store can be used as well.
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
# Chat model used for the extraction example below.
chat_model = ChatOpenAI(
    temperature=0,
    model='gpt-3.5-turbo',
    openai_api_key=openai_api_key,
)

# Vanilla extraction: free-text instructions plus the sentence to analyse.
instructions = """
You will be given a sentence with fruit names, extract those fruit names and assign an emoji to them
Return the fruit name and emojis in a python dictionary
"""
fruit_names = """
Apple, Pear, this is an kiwi
"""

# The prompt is the instructions with the fruit sentence appended.
prompt = "".join([instructions, fruit_names])

# Send the prompt as a single human message, then show the reply text and its type.
output = chat_model([HumanMessage(content=prompt)])
print(output.content, type(output.content), sep="\n")
# Parse the output and obtain structured data.
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# Fields the model is asked to return.
response_schemas = [
    ResponseSchema(name="artist", description="The name of the musical artist"),
    ResponseSchema(name="song", description="The name of the song that the artist plays"),
]

# The parser interprets the LLM output using the schema defined above and
# returns the expected structured data.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Formatting instructions derived from the schema. They are injected into the
# prompt below; this name was referenced there without ever being defined.
format_instructions = output_parser.get_format_instructions()

# This prompt differs from the one built for the chat model earlier: it is a
# ChatPromptTemplate that embeds the format instructions next to the user's text.
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("Given a command from the user, extract the artist and song names \n \n {format_instructions}\n{user_prompt}")
    ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions},
)

artist_query = prompt.format_prompt(user_prompt="I really like So Young by Portugal. The Man")
print(artist_query.messages[0].content)
# Embeddings, store, and retrieval
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# Model and doc loader
from langchain import OpenAI
from langchain.document_loaders import TextLoader

# Eval
from langchain.evaluation.qa import QAEvalChain

llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

# Alice in Wonderland is used as the text input again.
loader = TextLoader('wonderland.txt')
doc = loader.load()

# Report how many documents were loaded and the character count of the first one.
print(f"You have {len(doc)} document")
print(f"You have {len(doc[0].page_content)} characters in that document")
You have 1 document
You have 164014 characters in that document
# Text splitter used to chunk the loaded document.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the document into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)
docs = text_splitter.split_documents(doc)

# Get the total number of characters so we can see the average later
num_total_characters = sum(len(x.page_content) for x in docs)
print(f"Now you have {len(docs)} documents that have an average of {num_total_characters / len(docs):,.0f} characters (smaller pieces)")
Now you have 62 documents that have an average of 2,846 characters (smaller pieces)
# Question/answer pairs; each entry is a dict with 'question' and 'answer' keys.
question_answers = [
    {'question': question, 'answer': answer}
    for question, answer in (
        ("Which animal give alice a instruction?", 'rabbit'),
        ("What is the author of the book", 'Elon Mask'),
    )
]
[{'question': 'Which animal give alice a instruction?',
'answer': 'rabbit',
'result': ' The Caterpillar gave Alice instructions.'},
{'question': 'What is the author of the book',
'answer': 'Elon Mask',
'result': ' The author of the book is Lewis Carroll.'}]
# Query a SQLite database in natural language, using the San Francisco trees dataset.
# Don't run the following code unless SQLite and the database file below are set up.
from langchain import OpenAI, SQLDatabase, SQLDatabaseChain

llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

sqlite_db_path = 'data/San_Francisco_Trees.db'
db = SQLDatabase.from_uri(f"sqlite:///{sqlite_db_path}")

# Build the chain that connects the LLM to the database. `db_chain` was
# called below without ever being created.
db_chain = SQLDatabaseChain(llm=llm, database=db, verbose=True)
db_chain.run("How many Species of trees are there in San Francisco?")
Find which table to use
Find which column to use
Construct the correct sql query
Execute that query
Get the result
Return a natural language response back
confirm LLM result via pandas
import sqlite3
from contextlib import closing

import pandas as pd

# Define the SQL query: count the distinct qSpecies values in SFTrees.
query = "SELECT count(distinct qSpecies) FROM SFTrees"

# Connect to the SQLite database and read the query result into a pandas
# DataFrame. closing() guarantees the connection is closed even if the query
# raises (a sqlite3 connection used directly as a context manager only manages
# the transaction, it does not close the connection).
with closing(sqlite3.connect(sqlite_db_path)) as connection:
    df = pd.read_sql_query(query, connection)

# Display the result in the first column first cell
print(df.iloc[0, 0])
六、代码理解 (Code Understanding)
代码理解用到的工具和文档问答差不多,不过我们的输入是一个项目的代码。
# Helper to read local files
import os

# Vector Support
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# Model and chain
from langchain.chat_models import ChatOpenAI

# Text splitters
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

llm = ChatOpenAI(model='gpt-3.5-turbo', openai_api_key=openai_api_key)

root_dir = '/content/drive/MyDrive/thefuzz-master'
docs = []

# Go through each folder
for dirpath, dirnames, filenames in os.walk(root_dir):
    # Go through each file
    for file in filenames:
        try:
            # Load up the file as a doc and split
            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception:
            # Deliberately best-effort: any file that fails to load or split
            # is skipped so the walk over the project can continue.
            continue

print(f"You have {len(docs)} documents\n")
print("------ Start Document ------")
print(docs[0].page_content[:300])
You have 175 documents
------ Start Document ------
from timeit import timeit
import math
import csv
iterations = 100000
reader = csv.DictReader(open('data/titledata.csv'), delimiter='|')
titles = [i['custom_title'] for i in reader]
title_blob = '\n'.join(titles)
cirque_strings = [
"cirque du soleil - zarkana - las vegas",
"cirque du sol
# Embed the loaded documents and index them in a FAISS vector store.
# `docsearch` was referenced below without ever being created.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
docsearch = FAISS.from_documents(docs, embeddings)

# Get our retriever ready
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

# First question: ask which function to use.
query = "What function do I use if I want to find the most similar item in a list of items?"
output = qa.run(query)
print(output)

# Second question: ask for code only.
query = "Can you write the code to use the process.extractOne() function? Only respond with code. No other text or explanation"
output = qa.run(query)
print(output)
process.extractOne(query, choices)
七、API 交互 (Interacting with APIs)
如果你需要的数据或操作在 API 之后,就需要 LLM 能够和 API 进行交互。
到这个环节,就与 Agents 和 Plugins 息息相关了。
Demo 可能很简单,但是功能可以很复杂。
from langchain.chains import APIChain
from langchain.llms import OpenAI
# LLM client used by the API chain below.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
# Plain-text description of the API handed to the chain: a base URL and two endpoints.
# NOTE(review): the text contains typos ("informatin", "Uesd") and describes the
# currency endpoint as finding information about a region; it is runtime text,
# so it is left untouched here.
api_docs = """
BASE URL: https://restcountries.com/
API Documentation:
The API endpoint /v3.1/name/{name} Used to find informatin about a country. All URL parameters are listed below:
- name: Name of country - Ex: italy, france
The API endpoint /v3.1/currency/{currency} Uesd to find information about a region. All URL parameters are listed below:
- currency: 3 letter currency. Example: USD, COP
Woo! This is my documentation
"""
# Build the chain from the LLM and the API description.
chain_new = APIChain.from_llm_and_api_docs(llm, api_docs, verbose=True)
chain_new.run('Can you tell me information about france?')
# NOTE(review): the bare string below looks like pasted sample output rather than code — confirm.
' France is an officially-assigned, independent country located in Western Europe. Its capital is Paris'
chain_new.run('Can you tell me about the currency COP?')
# NOTE(review): as above, this bare string appears to be pasted sample output — confirm.
' The currency of Colombia is the Colombian peso (COP), symbolized by the "$" sign.'
from langchain.llms import OpenAI
from langchain import LLMChain
from langchain.prompts.prompt import PromptTemplate
# Chat specific components
from langchain.memory import ConversationBufferMemory

# Prompt text with two slots: {chat_history} and {human_input}.
template = """
You are a chatbot that is unhelpful.
Your goal is to not help the user but only make jokes.
Take what the user is saying and make a joke out of it
{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input"],
    template=template,
)

# Conversation memory; memory_key matches the {chat_history} slot in the template.
memory = ConversationBufferMemory(memory_key="chat_history")
# List of Tool entries, each given a name, the callable to invoke, and a description string.
# NOTE(review): `Tool`, `search`, and `requests` are not defined or imported in the
# visible source — confirm they are set up before this list is built.
toolkit = [
Tool(
name = "Search",
func=search.run,
description="useful for when you need to search google to answer questions about current events"
),
Tool(
name = "Requests",
func=requests.get,
description="Useful for when you to make a request to a URL"
),
]