自然语言处理基础与文本分析系统开发

自然语言处理基础与文本分析系统开发 | 极客日志

# Install the NLTK package with pip.
pip install nltk

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Download the NLTK data packages this example relies on.
for resource in ("punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words"):
    nltk.download(resource)

text = "Barack Obama was born in Hawaii. He was elected president in 2008."

# Each stage consumes the previous stage's output:
# raw text -> tokens -> POS-tagged tokens -> named-entity chunks.
tokens = word_tokenize(text)
tags = pos_tag(tokens)
chunks = ne_chunk(tags)

# Print every intermediate result under its own heading.
for heading, value in (("Tokens:", tokens), ("Tags:", tags), ("Chunks:", chunks)):
    print(heading, value)

# Install spaCy, then download the en_core_web_sm model package that the
# example loads with spacy.load().
pip install spacy
python -m spacy download en_core_web_sm

import spacy

nlp = spacy.load("en_core_web_sm")
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
doc = nlp(text)

# One output line per token: surface form, lemma, coarse and fine POS tags,
# dependency label, shape, and the alphabetic / stop-word flags.
for token in doc:
    token_fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*token_fields)

# One output line per entity: text, character offsets, and label.
for ent in doc.ents:
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)

# Install the Hugging Face Transformers package with pip.
pip install transformers

from transformers import pipeline

# Text classification.
# No model name is passed to pipeline(), so the choice of checkpoint is left
# to the library.
classifier = pipeline("text-classification")
text = "This is a great product. I really like it."
result = classifier(text)
print("Text Classification:", result)

# Sentiment analysis (runs on the same review text as above).
sentiment_analyzer = pipeline("sentiment-analysis")
result = sentiment_analyzer(text)
print("Sentiment Analysis:", result)

# Named entity recognition (`text` is rebound to a sentence containing names).
ner = pipeline("ner")
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
result = ner(text)
print("Named Entity Recognition:", result)

# Install the NLTK library
pip install nltk
# Install the spaCy library and its en_core_web_sm model package
pip install spacy
python -m spacy download en_core_web_sm
# Install the Transformers library
pip install transformers
# Install the Flask library
pip install flask
# Install the Matplotlib library
pip install matplotlib
# Install the Seaborn library
pip install seaborn

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re

# Fetch the NLTK data packages (punkt, stopwords) once at import time.
for _package in ("punkt", "stopwords"):
    nltk.download(_package)

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lowercase words.

    Steps, in order: replace every character that is not an ASCII letter with
    a space, lowercase the result, tokenize it with ``word_tokenize``, drop
    English stop words and punctuation tokens, and join what is left with
    single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, or ``None`` if any step raised. The error is
        printed rather than propagated.
    """
    try:
        letters_only = re.sub(r"[^a-zA-Z]", " ", text)
        lowered = letters_only.lower()
        words = word_tokenize(lowered)
        english_stop_words = set(stopwords.words("english"))

        kept_words = []
        for word in words:
            # Skip anything that is a stop word or a punctuation mark.
            if word in english_stop_words or word in string.punctuation:
                continue
            kept_words.append(word)

        return " ".join(kept_words)
    except Exception as e:
        print(f"文本预处理失败：{e}")
        return None

from transformers import pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Fetch the NLTK data packages once at import time.
for _package in ("punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words"):
    nltk.download(_package)

# The three pipelines are created once at module level and reused by every
# analyze_text() call.
classifier = pipeline("text-classification")  # text classification
sentiment_analyzer = pipeline("sentiment-analysis")  # sentiment analysis
ner = pipeline("ner")  # named entity recognition

def analyze_text(text):
    """Run every analysis over ``text`` and bundle the results in one dict.

    The module-level ``classifier``, ``sentiment_analyzer`` and ``ner``
    pipelines are applied first, followed by NLTK tokenization, POS tagging
    and named-entity chunking.

    Args:
        text: The string to analyze.

    Returns:
        A dict with the keys ``"classification"``, ``"sentiment"``, ``"ner"``,
        ``"pos"`` (the ``pos_tag`` output) and ``"ne"`` (the ``ne_chunk``
        output), or ``None`` if any step raised. The error is printed rather
        than propagated.
    """
    try:
        # Transformer-based results.
        report = {
            "classification": classifier(text),
            "sentiment": sentiment_analyzer(text),
            "ner": ner(text),
        }
        # NLTK-based results: the POS tags feed the entity chunker.
        tagged_tokens = pos_tag(word_tokenize(text))
        report["pos"] = tagged_tokens
        report["ne"] = ne_chunk(tagged_tokens)
        return report
    except Exception as e:
        print(f"文本分析失败：{e}")
        return None

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def _save_barplot(x, y, title, xlabel, ylabel, path, data=None, draw=True):
    """Render one bar chart and write it to ``path``.

    ``x``, ``y`` and ``data`` are forwarded unchanged to ``sns.barplot``.
    When ``draw`` is false the bars are skipped and an empty chart carrying
    only the title and axis labels is saved instead. The figure is closed
    even if plotting or saving raises, so failed requests do not accumulate
    open figures.
    """
    plt.figure(figsize=(10, 6))
    try:
        if draw:
            sns.barplot(x=x, y=y, data=data)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.savefig(path)
    finally:
        plt.close()


def visualize_results(analysis_result):
    """Write the four result charts as PNG files under ``static/``.

    Files written: ``classification.png`` and ``sentiment.png`` (score per
    label), ``ner.png`` (``end - start`` of each recognized entity, grouped
    by its ``"entity"`` field) and ``pos.png`` (token count per POS tag).

    Args:
        analysis_result: Mapping with ``"classification"``, ``"sentiment"``,
            ``"ner"`` and ``"pos"`` entries. Presumably the dict returned by
            ``analyze_text`` -- verify against the caller.

    Returns:
        ``True`` when every chart was written, ``False`` if any step raised.
        The error is printed rather than propagated.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import os

    try:
        # savefig does not create missing directories.
        os.makedirs("static", exist_ok=True)

        # Text classification chart.
        classification_df = pd.DataFrame(analysis_result["classification"])
        _save_barplot(
            "label", "score",
            "Text Classification", "Label", "Score",
            "static/classification.png",
            data=classification_df,
        )

        # Sentiment analysis chart.
        sentiment_df = pd.DataFrame(analysis_result["sentiment"])
        _save_barplot(
            "label", "score",
            "Sentiment Analysis", "Label", "Score",
            "static/sentiment.png",
            data=sentiment_df,
        )

        # Named entity recognition chart. A text with no recognized entities
        # yields a DataFrame without "start"/"end" columns; in that case save
        # an empty chart instead of failing the whole visualization.
        ner_df = pd.DataFrame(analysis_result["ner"])
        has_entities = not ner_df.empty
        if has_entities:
            ner_df["length"] = ner_df["end"] - ner_df["start"]
        _save_barplot(
            "entity", "length",
            "Named Entity Recognition", "Entity", "Length",
            "static/ner.png",
            data=ner_df,
            draw=has_entities,
        )

        # Part-of-speech chart: number of tokens per tag.
        pos_df = pd.DataFrame(analysis_result["pos"], columns=["Token", "POS"])
        pos_counts = pos_df["POS"].value_counts()
        _save_barplot(
            pos_counts.index, pos_counts.values,
            "Part of Speech", "POS", "Count",
            "static/pos.png",
            draw=not pos_counts.empty,
        )
        return True
    except Exception as e:
        print(f"分析结果可视化失败：{e}")
        return False

from flask import Flask, render_template, request, redirect, url_for
import os
import uuid
from text_preprocessor import preprocess_text
from text_analyzer import analyze_text
from results_visualizer import visualize_results

# Application object plus the settings read by the route handlers:
# where uploads are stored, which file extensions are accepted, and where
# generated images are stored.
app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER="uploads",
    ALLOWED_EXTENSIONS={"txt"},
    STATIC_FOLDER="static",
)

def allowed_file(filename):
    """Return whether ``filename`` has an accepted extension.

    The extension is the text after the last dot, lowercased before it is
    looked up in ``app.config["ALLOWED_EXTENSIONS"]``. Names without a dot
    are rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in app.config["ALLOWED_EXTENSIONS"]

@app.route("/")
def index():
    """Render the ``index.html`` template for the site root."""
    landing_page = render_template("index.html")
    return landing_page

@app.route("/analyze", methods=["POST"])
def analyze():
    """Handle the analysis form and render ``result.html``.

    The text comes from the uploaded ``file`` field when one is supplied,
    otherwise from the ``text`` form field. It is then passed through
    ``preprocess_text``, ``analyze_text`` and ``visualize_results``; the
    first stage that fails ends the request with an error message on the
    result page.

    Returns:
        The rendered result page (with either ``error`` or
        ``analysis_result`` set), or a redirect to the index page when the
        request carries neither a file nor text.
    """
    text = ""
    filename = None
    upload = request.files.get("file")
    if upload is not None and upload.filename != "":
        # Reject unsupported uploads explicitly instead of silently analyzing
        # an empty string.
        if not allowed_file(upload.filename):
            return render_template("result.html", error="不支持的文件类型")

        # Store under a random name so client-supplied names never reach the
        # filesystem; only the (already validated) extension is kept.
        filename = str(uuid.uuid4()) + "." + upload.filename.rsplit(".", 1)[1].lower()
        # The folder is otherwise only created by the __main__ guard.
        os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
        saved_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
        upload.save(saved_path)
        try:
            # Decode explicitly as UTF-8 rather than with the platform default.
            with open(saved_path, "r", encoding="utf-8") as f:
                text = f.read()
        except (OSError, UnicodeDecodeError):
            return render_template("result.html", error="文件读取失败")
    elif request.form.get("text", "") != "":
        text = request.form["text"]
    else:
        # This route only accepts POST, so redirecting to request.url would
        # end in a 405; send the user back to the form instead.
        return redirect(url_for("index"))

    processed_text = preprocess_text(text)
    if processed_text is None:
        return render_template("result.html", error="文本预处理失败")

    analysis_result = analyze_text(processed_text)
    if analysis_result is None:
        return render_template("result.html", error="文本分析失败")

    visualization_result = visualize_results(analysis_result)
    if not visualization_result:
        return render_template("result.html", error="分析结果可视化失败")

    return render_template("result.html", filename=filename, analysis_result=analysis_result)

@app.route("/uploads/<filename>")
def uploaded_file(filename):
    """Serve the file named ``filename`` from the configured upload folder."""
    # The module-level flask import does not include send_from_directory, so
    # without this import the route raises NameError when it is requested.
    from flask import send_from_directory

    return send_from_directory(app.config["UPLOAD_FOLDER"], filename)

@app.route("/static/<filename>")
def static_file(filename):
    """Serve the file named ``filename`` from the configured static folder."""
    # The module-level flask import does not include send_from_directory, so
    # without this import the route raises NameError when it is requested.
    from flask import send_from_directory

    return send_from_directory(app.config["STATIC_FOLDER"], filename)

if __name__ == "__main__":
    # Make sure both working folders exist before serving. exist_ok=True
    # replaces the check-then-create pattern, which can fail if the folder
    # appears between the check and the makedirs call.
    for folder_key in ("UPLOAD_FOLDER", "STATIC_FOLDER"):
        os.makedirs(app.config[folder_key], exist_ok=True)
    # NOTE(review): debug=True is a development setting; do not use it when
    # the app is reachable by untrusted clients.
    app.run(debug=True)

<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>文本分析系统</title>
    <!-- Page styles: a centered white card (.container) holding the
         submission form (.analyze-form), the result charts and tables
         (.result), and the error banner (.error). -->
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #fff;
            border-radius: 5px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            margin-top: 50px;
        }
        h1 {
            text-align: center;
            margin-bottom: 20px;
            color: #333;
        }
        .analyze-form {
            text-align: center;
            margin-bottom: 20px;
        }
        .analyze-form input[type="file"] {
            margin-bottom: 10px;
        }
        .analyze-form textarea {
            width: 100%;
            height: 200px;
            margin-bottom: 10px;
            padding: 10px;
            border: 1px solid #ddd;
            border-radius: 5px;
        }
        .analyze-form input[type="submit"] {
            padding: 10px 20px;
            background-color: #4CAF50;
            color: #fff;
            border: none;
            border-radius: 5px;
            cursor: pointer;
        }
        .analyze-form input[type="submit"]:hover {
            background-color: #45a049;
        }
        .result {
            text-align: center;
            margin-top: 20px;
        }
        .result img {
            max-width: 100%;
            height: auto;
            margin-bottom: 20px;
        }
        .result table {
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 20px;
        }
        .result th, .result td {
            padding: 10px;
            border: 1px solid #ddd;
        }
        .result th {
            background-color: #f2f2f2;
            text-align: left;
        }
        .error {
            color: red;
            text-align: center;
            margin-top: 20px;
        }
    </style>
</head>
<body>
    <!-- Main card: submission form, then either an error banner or the
         analysis results, depending on the template variables passed in. -->
    <div class="container">
        <h1>文本分析系统</h1>
        <!-- Posts a .txt upload and/or pasted text to /analyze as multipart form data. -->
        <form class="analyze-form" method="POST" enctype="multipart/form-data" action="/analyze">
            <input type="file" name="file" accept=".txt"><br>
            <textarea name="text" placeholder="请输入文本内容"></textarea><br>
            <input type="submit" value="分析文本">
        </form>
        <!-- Shown only when the template receives an `error` value. -->
        {% if error %} 
        <div class="error">{{ error }}</div> 
        {% endif %} 
        <!-- Shown only when the template receives an `analysis_result` value. -->
        {% if analysis_result %} 
        <div class="result">
            <!-- Text classification: chart image plus a label / score table. -->
            <h2>文本分类结果</h2>
            <img src="{{ url_for('static_file', filename='classification.png') }}" alt="文本分类结果">
            <table>
                <tr><th>标签</th><th>置信度</th></tr>
                {% for item in analysis_result.classification %} 
                <tr><td>{{ item.label }}</td><td>{{ item.score }}</td></tr> 
                {% endfor %} 
            </table>
            <!-- Sentiment analysis: chart image plus a label / score table. -->
            <h2>情感分析结果</h2>
            <img src="{{ url_for('static_file', filename='sentiment.png') }}" alt="情感分析结果">
            <table>
                <tr><th>标签</th><th>置信度</th></tr>
                {% for item in analysis_result.sentiment %} 
                <tr><td>{{ item.label }}</td><td>{{ item.score }}</td></tr> 
                {% endfor %} 
            </table>
            <!-- Named entity recognition: chart image plus one row per item (word, start, end, entity). -->
            <h2>命名实体识别结果</h2>
            <img src="{{ url_for('static_file', filename='ner.png') }}" alt="命名实体识别结果">
            <table>
                <tr><th>实体</th><th>起始位置</th><th>结束位置</th><th>标签</th></tr>
                {% for item in analysis_result.ner %} 
                <tr><td>{{ item.word }}</td><td>{{ item.start }}</td><td>{{ item.end }}</td><td>{{ item.entity }}</td></tr> 
                {% endfor %} 
            </table>
            <!-- Part-of-speech tagging: chart image plus one row per item (first and second element). -->
            <h2>词性标注结果</h2>
            <img src="{{ url_for('static_file', filename='pos.png') }}" alt="词性标注结果">
            <table>
                <tr><th>单词</th><th>词性</th></tr>
                {% for item in analysis_result.pos %} 
                <tr><td>{{ item.0 }}</td><td>{{ item.1 }}</td></tr> 
                {% endfor %} 
            </table>
        </div> 
        {% endif %} 
    </div>
</body>
</html>

Barack Obama was born in Hawaii. He was elected president in 2008. This is a great product. I really like it.

自然语言处理基础与文本分析系统开发

自然语言处理基础与文本分析系统开发

学习目标

重点内容

一、自然语言处理基础

1.1 自然语言处理的基本概念

1.1.1 自然语言处理的重要性

1.1.2 自然语言处理的应用场景

1.2 自然语言处理的基本任务

1.2.1 分词

1.2.2 词性标注

1.2.3 命名实体识别

1.2.4 语法分析

1.2.5 语义分析

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

二、自然语言处理库介绍

2.1 NLTK 介绍

2.1.1 NLTK 的安装

2.1.2 NLTK 的基本使用

2.2 spaCy 介绍

2.2.1 spaCy 的安装

2.2.2 spaCy 的基本使用

2.3 Transformers 介绍

2.3.1 Transformers 的安装

2.3.2 Transformers 的基本使用

三、文本预处理方法

3.1 文本清洗

3.2 文本向量化

四、文本分析任务

4.1 文本分类

4.2 情感分析

4.3 命名实体识别

五、实战项目：文本分析系统开发

5.1 项目需求分析

5.1.1 应用目标

5.1.2 用户需求

5.1.3 功能范围

5.2 系统架构设计

5.2.1 应用架构

5.2.2 数据存储方案

5.3 系统实现

5.3.1 开发环境搭建

5.3.2 文本预处理

5.3.3 文本分析

5.3.4 分析结果可视化

5.3.5 用户界面

5.3.6 前端界面

5.4 系统运行与测试

5.4.1 系统运行

5.4.2 系统测试

六、总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具