跳到主要内容
极客日志极客日志
首页博客AI提示词GitHub精选代理工具
搜索
|注册
博客列表
CAI算法

C4.5 决策树算法原理与 C 语言实现详解

基于 C 语言实现的 C4.5 决策树算法,涵盖信息熵与信息增益率计算、树的递归构建及分类预测。通过天气数据集示例,演示数据结构设计、核心函数逻辑及完整运行流程,解决多取值属性偏好问题,提供源码与结果分析。

二进制发布于 2026/3/15更新于 2026/5/46 浏览

C4.5 决策树算法原理与 C 语言实现

1. 算法背景与目标

C4.5 是 ID3 算法的改进版,核心在于使用**信息增益率(Gain Ratio)**作为划分标准。这有效避免了 ID3 偏向于取值较多属性的缺陷,提升了模型的泛化能力。

本次实战基于经典的天气数据集(Outlook, Temperature, Humidity, Windy),目标是构建分类模型预测是否适合运动(Play)。

文章配图

2. 核心数学原理

2.1 信息熵 (Entropy)

熵衡量样本集合的不确定性。纯度越高,熵越低。

文章配图

文章配图

2.2 信息增益与分裂信息

信息增益表示得知属性后不确定性的减少量。但为了修正多值偏好,需引入分裂信息(Split Information)。

文章配图

2.3 信息增益率

C4.5 的核心指标,即信息增益除以分裂信息。

文章配图

2.4 生成过程

算法采用递归分治策略:选择最优属性 -> 划分数据 -> 递归构建子树。

文章配图

3. 数据结构设计

在 C 语言中,我们使用结构体来抽象样本和树节点。Sample 对应训练集的一行数据,Node 则描述树的层级关系。

# MAX_SAMPLES 20




 
     attribute[MAX_VALUE_LEN];
     label[MAX_VALUE_LEN];
     is_leaf;
     branch_count;
     branch_values[MAX_SAMPLES][MAX_VALUE_LEN];
    
} Node;


 
     Outlook[MAX_VALUE_LEN];
     Temperature[MAX_VALUE_LEN];
     Humidity[MAX_VALUE_LEN];
     Windy[MAX_VALUE_LEN];
     Play[MAX_VALUE_LEN];
} Sample;
define
#define MAX_ATTR 5
#define MAX_VALUE_LEN 20
// 决策树节点定义
typedef
struct Node {
char
char
int
int
char
struct Node *children[MAX_SAMPLES];
// 样本数据定义
typedef
struct {
char
char
char
char
char

4. 核心功能实现

4.1 辅助函数与熵计算

首先封装字符串比较工具,并实现香农熵计算。当子集为空或纯净时,熵为 0。

int equals(const char *a, const char *b) {
    return strcmp(a, b) == 0;
}

double entropy(Sample *data, int size) {
    if (size == 0) return 0.0;
    int count_yes = 0, count_no = 0;
    for (int i = 0; i < size; i++) {
        if (equals(data[i].Play, "Yes")) count_yes++;
        else count_no++;
    }
    double p1 = (double)count_yes / size, p2 = (double)count_no / size;
    double e = 0.0;
    if (p1 > 0) e -= p1 * log2(p1);
    if (p2 > 0) e -= p2 * log2(p2);
    return e;
}

4.2 分裂信息与增益率

split_info 计算属性的内在信息值,用于消除多值属性偏差。gain_ratio 则是最终的选择依据。

double split_info(Sample *data, int size, char *attr) {
    if (size == 0) return 0;
    char values[MAX_SAMPLES][MAX_VALUE_LEN];
    int counts[MAX_SAMPLES] = {0}, value_count = 0;
    // 统计各属性值的分布...
    // (此处省略具体遍历逻辑,见完整代码)
    double si = 0;
    for (int i = 0; i < value_count; i++) {
        double p = (double)counts[i] / size;
        si -= p * log2(p);
    }
    return si;
}

double gain_ratio(Sample *data, int size, char *attr) {
    double ig = info_gain(data, size, attr, 0);
    double si = split_info(data, size, attr);
    return (si == 0) ? 0 : ig / si;
}

4.3 递归构建树

这是算法最关键的步骤。build_tree 负责判断终止条件(标签一致或属性耗尽),选择最佳分裂点,并递归调用自身。

Node *build_tree(Sample *data, int size, char **attrs, int attr_count) {
    Node *node = (Node *)malloc(sizeof(Node));
    memset(node, 0, sizeof(Node));
    
    // 终止条件:标签全同或无可用属性
    if (all_same_label(data, size) || attr_count == 0) {
        node->is_leaf = 1;
        strcpy(node->label, all_same_label(data, size) ? data[0].Play : majority_label(data, size));
        return node;
    }
    
    // 寻找最佳分裂属性
    double best_gr = -1;
    int best_attr = -1;
    for (int i = 0; i < attr_count; i++) {
        double gr = gain_ratio(data, size, attrs[i]);
        if (gr > best_gr) {
            best_gr = gr;
            best_attr = i;
        }
    }
    
    strcpy(node->attribute, attrs[best_attr]);
    node->is_leaf = 0;
    
    // 划分数据并递归
    // ... (具体划分逻辑见完整代码)
    return node;
}

5. 完整代码与运行分析

将上述模块整合,包含主程序入口、打印可视化及规则提取功能。编译运行后,程序会输出原始数据、各属性增益率计算过程、生成的决策树结构以及测试集的预测结果。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define MAX_SAMPLES 20
#define MAX_ATTR 5
#define MAX_VALUE_LEN 20
#define MAX_RULE_LEN 256

// 结构体定义
typedef struct Node {
    char attribute[MAX_VALUE_LEN];
    char label[MAX_VALUE_LEN];
    int is_leaf;
    int branch_count;
    char branch_values[MAX_SAMPLES][MAX_VALUE_LEN];
    struct Node *children[MAX_SAMPLES];
} Node;

typedef struct {
    char Outlook[MAX_VALUE_LEN];
    char Temperature[MAX_VALUE_LEN];
    char Humidity[MAX_VALUE_LEN];
    char Windy[MAX_VALUE_LEN];
    char Play[MAX_VALUE_LEN];
} Sample;

// 训练数据集
Sample data[MAX_SAMPLES] = {
    {"sunny", "hot", "high", "false", "No"},
    {"sunny", "hot", "high", "true", "No"},
    {"overcast", "hot", "high", "false", "Yes"},
    {"rain", "mild", "high", "false", "Yes"},
    {"rain", "cool", "normal", "false", "Yes"},
    {"rain", "cool", "normal", "true", "No"},
    {"overcast", "cool", "normal", "true", "Yes"},
    {"sunny", "mild", "high", "false", "No"},
    {"sunny", "cool", "normal", "false", "Yes"},
    {"rain", "mild", "normal", "false", "Yes"},
    {"sunny", "mild", "normal", "true", "Yes"},
    {"overcast", "mild", "high", "true", "Yes"},
    {"overcast", "hot", "normal", "false", "Yes"},
    {"rain", "mild", "high", "true", "No"}
};
int total_samples = 14;
char *attributes[] = {"Outlook", "Temperature", "Humidity", "Windy"};
int attr_count = 4;

// 辅助函数
int equals(const char *a, const char *b) { return strcmp(a, b) == 0; }

// 熵计算
double entropy(Sample *data, int size) {
    if (size == 0) return 0.0;
    int count_yes = 0, count_no = 0;
    for (int i = 0; i < size; i++) {
        if (equals(data[i].Play, "Yes")) count_yes++;
        else count_no++;
    }
    double p1 = (double)count_yes / size, p2 = (double)count_no / size;
    double e = 0.0;
    if (p1 > 0) e -= p1 * log2(p1);
    if (p2 > 0) e -= p2 * log2(p2);
    return e;
}

// 分裂信息
double split_info(Sample *data, int size, char *attr) {
    if (size == 0) return 0;
    char values[MAX_SAMPLES][MAX_VALUE_LEN];
    int counts[MAX_SAMPLES] = {0}, value_count = 0;
    for (int i = 0; i < size; i++) {
        char *val;
        if (equals(attr, "Outlook")) val = data[i].Outlook;
        else if (equals(attr, "Temperature")) val = data[i].Temperature;
        else if (equals(attr, "Humidity")) val = data[i].Humidity;
        else val = data[i].Windy;
        int found = 0;
        for (int j = 0; j < value_count; j++)
            if (equals(values[j], val)) { counts[j]++; found = 1; break; }
        if (!found) { strcpy(values[value_count], val); counts[value_count++] = 1; }
    }
    double si = 0;
    for (int i = 0; i < value_count; i++) {
        double p = (double)counts[i] / size;
        si -= p * log2(p);
    }
    return si;
}

// 信息增益
double info_gain(Sample *data, int size, char *attr, int print_detail) {
    double total_entropy = entropy(data, size);
    char values[MAX_SAMPLES][MAX_VALUE_LEN];
    int value_count = 0;
    for (int i = 0; i < size; i++) {
        char *val;
        if (equals(attr, "Outlook")) val = data[i].Outlook;
        else if (equals(attr, "Temperature")) val = data[i].Temperature;
        else if (equals(attr, "Humidity")) val = data[i].Humidity;
        else val = data[i].Windy;
        int found = 0;
        for (int j = 0; j < value_count; j++)
            if (equals(values[j], val)) { found = 1; break; }
        if (!found) strcpy(values[value_count++], val);
    }
    double weighted_entropy = 0;
    if (print_detail) printf(" 子集熵计算:\n");
    for (int i = 0; i < value_count; i++) {
        Sample subset[MAX_SAMPLES];
        int subset_size = 0;
        for (int j = 0; j < size; j++) {
            char *val;
            if (equals(attr, "Outlook")) val = data[j].Outlook;
            else if (equals(attr, "Temperature")) val = data[j].Temperature;
            else if (equals(attr, "Humidity")) val = data[j].Humidity;
            else val = data[j].Windy;
            if (equals(val, values[i])) subset[subset_size++] = data[j];
        }
        double subset_entropy = entropy(subset, subset_size);
        double p = (double)subset_size / size;
        weighted_entropy += p * subset_entropy;
        if (print_detail) printf(" %-10s: 熵=%.4f, 权重=%.2f, 贡献=%.4f\n", values[i], subset_entropy, p, p * subset_entropy);
    }
    if (print_detail) {
        printf(" Info_%s(D) = %.4f\n", attr, weighted_entropy);
        printf(" 信息增益计算:%.4f - %.4f = %.4f\n\n", total_entropy, weighted_entropy, total_entropy - weighted_entropy);
    }
    return total_entropy - weighted_entropy;
}

// 信息增益率
double gain_ratio(Sample *data, int size, char *attr) {
    double ig = info_gain(data, size, attr, 0);
    double si = split_info(data, size, attr);
    return (si == 0) ? 0 : ig / si;
}

// 辅助判断
double all_same_label(Sample *data, int size) {
    for (int i = 1; i < size; i++)
        if (!equals(data[0].Play, data[i].Play)) return 0;
    return 1;
}

const char *majority_label(Sample *data, int size) {
    int count_yes = 0;
    for (int i = 0; i < size; i++)
        if (equals(data[i].Play, "Yes")) count_yes++;
    return (count_yes >= size - count_yes) ? "Yes" : "No";
}

// 构建决策树
Node *build_tree(Sample *data, int size, char **attrs, int attr_count) {
    Node *node = (Node *)malloc(sizeof(Node));
    memset(node, 0, sizeof(Node));
    if (all_same_label(data, size) || attr_count == 0) {
        node->is_leaf = 1;
        strcpy(node->label, all_same_label(data, size) ? data[0].Play : majority_label(data, size));
        return node;
    }
    double best_gr = -1;
    int best_attr = -1;
    for (int i = 0; i < attr_count; i++) {
        double gr = gain_ratio(data, size, attrs[i]);
        if (gr > best_gr) {
            best_gr = gr;
            best_attr = i;
        }
    }
    strcpy(node->attribute, attrs[best_attr]);
    node->is_leaf = 0;
    char values[MAX_SAMPLES][MAX_VALUE_LEN];
    int value_count = 0;
    for (int i = 0; i < size; i++) {
        char *val;
        if (equals(attrs[best_attr], "Outlook")) val = data[i].Outlook;
        else if (equals(attrs[best_attr], "Temperature")) val = data[i].Temperature;
        else if (equals(attrs[best_attr], "Humidity")) val = data[i].Humidity;
        else val = data[i].Windy;
        int found = 0;
        for (int j = 0; j < value_count; j++)
            if (equals(values[j], val)) { found = 1; break; }
        if (!found) strcpy(values[value_count++], val);
    }
    for (int i = 0; i < value_count; i++) {
        Sample subset[MAX_SAMPLES];
        int subset_size = 0;
        for (int j = 0; j < size; j++) {
            char *val;
            if (equals(attrs[best_attr], "Outlook")) val = data[j].Outlook;
            else if (equals(attrs[best_attr], "Temperature")) val = data[j].Temperature;
            else if (equals(attrs[best_attr], "Humidity")) val = data[j].Humidity;
            else val = data[j].Windy;
            if (equals(values[i], val)) subset[subset_size++] = data[j];
        }
        char *remaining_attrs[MAX_ATTR];
        int new_count = 0;
        for (int k = 0; k < attr_count; k++)
            if (k != best_attr) remaining_attrs[new_count++] = attrs[k];
        node->children[node->branch_count] = build_tree(subset, subset_size, remaining_attrs, new_count);
        strcpy(node->branch_values[node->branch_count++], values[i]);
    }
    return node;
}

// 打印树
void print_tree(Node *node, int level) {
    if (node->is_leaf) {
        for (int i = 0; i < level; i++) printf(" ");
        printf("Leaf: %s\n", node->label);
        return;
    }
    for (int i = 0; i < level; i++) printf(" ");
    printf("Node: %s\n", node->attribute);
    for (int i = 0; i < node->branch_count; i++) {
        for (int j = 0; j < level; j++) printf(" ");
        printf(" [%s = %s]\n", node->attribute, node->branch_values[i]);
        print_tree(node->children[i], level + 2);
    }
}

// 预测
const char *predict(Node *tree, Sample *sample) {
    if (tree->is_leaf) return tree->label;
    const char *val;
    if (equals(tree->attribute, "Outlook")) val = sample->Outlook;
    else if (equals(tree->attribute, "Temperature")) val = sample->Temperature;
    else if (equals(tree->attribute, "Humidity")) val = sample->Humidity;
    else val = sample->Windy;
    for (int i = 0; i < tree->branch_count; i++)
        if (equals(tree->branch_values[i], val)) return predict(tree->children[i], sample);
    return "Unknown";
}

// 提取规则
void extract_rules(Node *tree, char *current_rule, int *rule_index) {
    if (tree->is_leaf) {
        printf("规则 %d: %s => %s\n", (*rule_index)++, current_rule, tree->label);
        return;
    }
    for (int i = 0; i < tree->branch_count; i++) {
        char new_rule[MAX_RULE_LEN];
        if (strlen(current_rule) == 0) snprintf(new_rule, sizeof(new_rule), "%s=%s", tree->attribute, tree->branch_values[i]);
        else snprintf(new_rule, sizeof(new_rule), "%s AND %s=%s", current_rule, tree->attribute, tree->branch_values[i]);
        extract_rules(tree->children[i], new_rule, rule_index);
    }
}

// 主程序
int main() {
    printf("原始数据:\n");
    printf("========================================\n");
    printf(" %-10s %-12s %-10s %-7s %-5s\n", "Outlook", "Temperature", "Humidity", "Windy", "Play?");
    printf("----------------------------------------\n");
    for (int i = 0; i < total_samples; i++)
        printf(" %-10s %-12s %-10s %-7s %-5s\n", data[i].Outlook, data[i].Temperature, data[i].Humidity, data[i].Windy, data[i].Play);
    printf("\n================================================================================\n\n");

    printf("2. 计算各属性的信息增益率与熵:\n");
    printf("--------------------------------------------------------------------------------\n");
    double total_entropy = entropy(data, total_samples);
    printf("数据集总熵:%.4f\n\n", total_entropy);
    for (int i = 0; i < attr_count; i++) {
        double ig = info_gain(data, total_samples, attributes[i], 1);
        double si = split_info(data, total_samples, attributes[i]);
        double gr = (si == 0) ? 0 : ig / si;
        printf("%s:\n", attributes[i]);
        printf(" 信息增益 (Info Gain): %.4f\n", ig);
        printf(" 分裂信息 (Split Info): %.4f\n", si);
        printf(" 信息增益率 (Gain Ratio): %.4f\n\n", gr);
    }
    printf("================================================================================\n\n");

    printf("3. 选择根节点分裂属性:\n");
    printf("--------------------------------------------------------------------------------\n");
    double best_gr = -1;
    int best_index = -1;
    for (int i = 0; i < attr_count; i++) {
        double gr = gain_ratio(data, total_samples, attributes[i]);
        if (gr > best_gr) {
            best_gr = gr;
            best_index = i;
        }
    }
    printf("最佳分裂属性:%s\n信息增益率:%.4f\n", attributes[best_index], best_gr);
    printf("\n================================================================================\n\n");

    printf("4. 建立决策树:\n");
    printf("--------------------------------------------------------------------------------\n");
    Node *tree = build_tree(data, total_samples, attributes, attr_count);
    printf("决策树结构:\n");
    print_tree(tree, 0);
    printf("================================================================================\n\n");

    printf("5. 对新输入进行分类预测:\n");
    printf("--------------------------------------------------------------------------------\n");
    Sample tests[] = {
        {"sunny", "cool", "high", "true", ""},
        {"overcast", "hot", "normal", "false", ""},
        {"rain", "mild", "high", "false", ""},
        {"sunny", "mild", "normal", "false", ""},
        {"rain", "cool", "normal", "true", ""}
    };
    int test_count = 5;
    for (int i = 0; i < test_count; i++) {
        const char *pred = predict(tree, &tests[i]);
        printf("测试样本 %d:\n", i + 1);
        printf(" 输入:Outlook=%-8s Temperature=%-6s Humidity=%-8s Windy=%-5s\n", tests[i].Outlook, tests[i].Temperature, tests[i].Humidity, tests[i].Windy);
        printf(" 预测结果:%s\n\n", pred);
    }
    printf("================================================================================\n");
    printf("训练集预测准确率:\n");
    printf("--------------------------------------------------------------------------------\n");
    int correct = 0;
    for (int i = 0; i < total_samples; i++) {
        const char *pred = predict(tree, &data[i]);
        if (equals(pred, data[i].Play)) correct++;
    }
    double acc = (double)correct / total_samples * 100.0;
    printf("准确率:%d/%d = %.2f%%\n", correct, total_samples, acc);
    printf("================================================================================\n\n");

    printf("6. 提取决策规则:\n");
    printf("--------------------------------------------------------------------------------\n");
    int rule_index = 1;
    extract_rules(tree, "", &rule_index);
    printf("================================================================================\n");
    return 0;
}

通过这段代码,你可以清晰地看到 C4.5 如何一步步从数据中'生长'出决策树。在实际应用中,只需替换 data 数组即可适配新的业务场景。

目录

  1. C4.5 决策树算法原理与 C 语言实现
  2. 1. 算法背景与目标
  3. 2. 核心数学原理
  4. 2.1 信息熵 (Entropy)
  5. 2.2 信息增益与分裂信息
  6. 2.3 信息增益率
  7. 2.4 生成过程
  8. 3. 数据结构设计
  9. 4. 核心功能实现
  10. 4.1 辅助函数与熵计算
  11. 4.2 分裂信息与增益率
  12. 4.3 递归构建树
  13. 5. 完整代码与运行分析
  • 💰 8折买阿里云服务器限时8折了解详情
  • GPT-5.5 超高智商模型1元抵1刀ChatGPT中转购买
  • 代充Chatgpt Plus/pro 帐号了解详情
  • 🤖 一键搭建Deepseek满血版了解详情
  • 一键打造专属AI 智能体了解详情
极客日志微信公众号二维码

微信扫一扫,关注极客日志

微信公众号「极客日志V2」,在微信中扫描左侧二维码关注。展示文案:极客日志V2 zeeklog

更多推荐文章

查看全部
  • Chrome DevTools MCP 实战:从安装到自动化测试全解析
  • 近端策略优化算法 (PPO) 详解与 PyTorch 实现
  • C++ 高并发内存池实战:ThreadCache 设计与实现
  • Ubuntu 部署 OpenClaw 完整教程
  • 力扣第 46、47 题:全排列与去重全排列算法解析
  • 数据结构初阶:堆的实现
  • Python Web 框架对比与实战:Django vs Flask vs FastAPI
  • OpenClaw 配置飞书机器人与 Kimi 2.5 接入指南
  • OpenClaw.ai:Agentic AI 时代的 Spring Framework 时刻
  • 直流无刷电机 FOC 控制算法
  • Python 中 GraphQL 的实战指南:从 Schema 设计到性能优化
  • GitHub 学生认证与 PyCharm 配置 Copilot 流程指南
  • 2026 年 Web 前端开发的 8 个趋势
  • FPGA 入门指南:从环境搭建到 LED 流水灯实战
  • Flutter pathfinding 库在 OpenHarmony 上的适配与实战
  • Python @dataclass 装饰器详解
  • SDXL Prompt Styler:AI 绘画风格控制与提示词工程优化方案
  • Linux poll 多路复用详解:select 的改进与局限
  • 垂直领域大模型构建:RAG 与微调的权衡与实践
  • Visual C++ 运行库整合包 vcredistAIO 安装与使用指南

相关免费在线工具

  • 加密/解密文本

    使用加密算法(如AES、TripleDES、Rabbit或RC4)加密和解密文本明文。 在线工具,加密/解密文本在线工具,online

  • RSA密钥对生成器

    生成新的随机RSA私钥和公钥pem证书。 在线工具,RSA密钥对生成器在线工具,online

  • Mermaid 预览与可视化编辑

    基于 Mermaid.js 实时预览流程图、时序图等图表,支持源码编辑与即时渲染。 在线工具,Mermaid 预览与可视化编辑在线工具,online

  • 随机西班牙地址生成器

    随机生成西班牙地址(支持马德里、加泰罗尼亚、安达卢西亚、瓦伦西亚筛选),支持数量快捷选择、显示全部与下载。 在线工具,随机西班牙地址生成器在线工具,online

  • Gemini 图片去水印

    基于开源反向 Alpha 混合算法去除 Gemini/Nano Banana 图片水印,支持批量处理与下载。 在线工具,Gemini 图片去水印在线工具,online

  • Base64 字符串编码/解码

    将字符串编码和解码为其 Base64 格式表示形式即可。 在线工具,Base64 字符串编码/解码在线工具,online