#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#define MAX_SAMPLES 20
#define MAX_ATTR 5
#define MAX_VALUE_LEN 20
#define MAX_RULE_LEN 256
typedef struct Node {
char attribute[MAX_VALUE_LEN];
char label[MAX_VALUE_LEN];
int is_leaf;
int branch_count;
char branch_values[MAX_SAMPLES][MAX_VALUE_LEN];
struct Node *children[MAX_SAMPLES];
} Node;
typedef struct {
char Outlook[MAX_VALUE_LEN];
char Temperature[MAX_VALUE_LEN];
char Humidity[MAX_VALUE_LEN];
char Windy[MAX_VALUE_LEN];
char Play[MAX_VALUE_LEN];
} Sample;
Sample data[MAX_SAMPLES] = {
{"sunny", "hot", "high", "false", "No"},
{"sunny", "hot", "high", "true", "No"},
{"overcast", "hot", "high", "false", "Yes"},
{"rain", "mild", "high", "false", "Yes"},
{"rain", "cool", "normal", "false", "Yes"},
{"rain", "cool", "normal", "true", "No"},
{"overcast", "cool", "normal", "true", "Yes"},
{"sunny", "mild", "high", "false", "No"},
{"sunny", "cool", "normal", "false", "Yes"},
{"rain", "mild", "normal", "false", "Yes"},
{"sunny", "mild", "normal", "true", "Yes"},
{"overcast", "mild", "high", "true", "Yes"},
{"overcast", "hot", "normal", "false", "Yes"},
{"rain", "mild", "high", "true", "No"}
};
int total_samples = 14;
char *attributes[] = {"Outlook", "Temperature", "Humidity", "Windy"};
int attr_count = 4;
int equals(const char *a, const char *b) { return strcmp(a, b) == 0; }
double entropy(Sample *data, int size) {
if (size == 0) return 0.0;
int count_yes = 0, count_no = 0;
for (int i = 0; i < size; i++) {
if (equals(data[i].Play, "Yes")) count_yes++;
else count_no++;
}
double p1 = (double)count_yes / size, p2 = (double)count_no / size;
double e = 0.0;
if (p1 > 0) e -= p1 * log2(p1);
if (p2 > 0) e -= p2 * log2(p2);
return e;
}
double split_info(Sample *data, int size, char *attr) {
if (size == 0) return 0;
char values[MAX_SAMPLES][MAX_VALUE_LEN];
int counts[MAX_SAMPLES] = {0}, value_count = 0;
for (int i = 0; i < size; i++) {
char *val;
if (equals(attr, "Outlook")) val = data[i].Outlook;
else if (equals(attr, "Temperature")) val = data[i].Temperature;
else if (equals(attr, "Humidity")) val = data[i].Humidity;
else val = data[i].Windy;
int found = 0;
for (int j = 0; j < value_count; j++)
if (equals(values[j], val)) { counts[j]++; found = 1; break; }
if (!found) { strcpy(values[value_count], val); counts[value_count++] = 1; }
}
double si = 0;
for (int i = 0; i < value_count; i++) {
double p = (double)counts[i] / size;
si -= p * log2(p);
}
return si;
}
double info_gain(Sample *data, int size, char *attr, int print_detail) {
double total_entropy = entropy(data, size);
char values[MAX_SAMPLES][MAX_VALUE_LEN];
int value_count = 0;
for (int i = 0; i < size; i++) {
char *val;
if (equals(attr, "Outlook")) val = data[i].Outlook;
else if (equals(attr, "Temperature")) val = data[i].Temperature;
else if (equals(attr, "Humidity")) val = data[i].Humidity;
else val = data[i].Windy;
int found = 0;
for (int j = 0; j < value_count; j++)
if (equals(values[j], val)) { found = 1; break; }
if (!found) strcpy(values[value_count++], val);
}
double weighted_entropy = 0;
if (print_detail) printf(" 子集熵计算:\n");
for (int i = 0; i < value_count; i++) {
Sample subset[MAX_SAMPLES];
int subset_size = 0;
for (int j = 0; j < size; j++) {
char *val;
if (equals(attr, "Outlook")) val = data[j].Outlook;
else if (equals(attr, "Temperature")) val = data[j].Temperature;
else if (equals(attr, "Humidity")) val = data[j].Humidity;
else val = data[j].Windy;
if (equals(val, values[i])) subset[subset_size++] = data[j];
}
double subset_entropy = entropy(subset, subset_size);
double p = (double)subset_size / size;
weighted_entropy += p * subset_entropy;
if (print_detail) printf(" %-10s: 熵=%.4f, 权重=%.2f, 贡献=%.4f\n", values[i], subset_entropy, p, p * subset_entropy);
}
if (print_detail) {
printf(" Info_%s(D) = %.4f\n", attr, weighted_entropy);
printf(" 信息增益计算:%.4f - %.4f = %.4f\n\n", total_entropy, weighted_entropy, total_entropy - weighted_entropy);
}
return total_entropy - weighted_entropy;
}
double gain_ratio(Sample *data, int size, char *attr) {
double ig = info_gain(data, size, attr, 0);
double si = split_info(data, size, attr);
return (si == 0) ? 0 : ig / si;
}
double all_same_label(Sample *data, int size) {
for (int i = 1; i < size; i++)
if (!equals(data[0].Play, data[i].Play)) return 0;
return 1;
}
const char *majority_label(Sample *data, int size) {
int count_yes = 0;
for (int i = 0; i < size; i++)
if (equals(data[i].Play, "Yes")) count_yes++;
return (count_yes >= size - count_yes) ? "Yes" : "No";
}
Node *build_tree(Sample *data, int size, char **attrs, int attr_count) {
Node *node = (Node *)malloc(sizeof(Node));
memset(node, 0, sizeof(Node));
if (all_same_label(data, size) || attr_count == 0) {
node->is_leaf = 1;
strcpy(node->label, all_same_label(data, size) ? data[0].Play : majority_label(data, size));
return node;
}
double best_gr = -1;
int best_attr = -1;
for (int i = 0; i < attr_count; i++) {
double gr = gain_ratio(data, size, attrs[i]);
if (gr > best_gr) {
best_gr = gr;
best_attr = i;
}
}
strcpy(node->attribute, attrs[best_attr]);
node->is_leaf = 0;
char values[MAX_SAMPLES][MAX_VALUE_LEN];
int value_count = 0;
for (int i = 0; i < size; i++) {
char *val;
if (equals(attrs[best_attr], "Outlook")) val = data[i].Outlook;
else if (equals(attrs[best_attr], "Temperature")) val = data[i].Temperature;
else if (equals(attrs[best_attr], "Humidity")) val = data[i].Humidity;
else val = data[i].Windy;
int found = 0;
for (int j = 0; j < value_count; j++)
if (equals(values[j], val)) { found = 1; break; }
if (!found) strcpy(values[value_count++], val);
}
for (int i = 0; i < value_count; i++) {
Sample subset[MAX_SAMPLES];
int subset_size = 0;
for (int j = 0; j < size; j++) {
char *val;
if (equals(attrs[best_attr], "Outlook")) val = data[j].Outlook;
else if (equals(attrs[best_attr], "Temperature")) val = data[j].Temperature;
else if (equals(attrs[best_attr], "Humidity")) val = data[j].Humidity;
else val = data[j].Windy;
if (equals(values[i], val)) subset[subset_size++] = data[j];
}
char *remaining_attrs[MAX_ATTR];
int new_count = 0;
for (int k = 0; k < attr_count; k++)
if (k != best_attr) remaining_attrs[new_count++] = attrs[k];
node->children[node->branch_count] = build_tree(subset, subset_size, remaining_attrs, new_count);
strcpy(node->branch_values[node->branch_count++], values[i]);
}
return node;
}
void print_tree(Node *node, int level) {
if (node->is_leaf) {
for (int i = 0; i < level; i++) printf(" ");
printf("Leaf: %s\n", node->label);
return;
}
for (int i = 0; i < level; i++) printf(" ");
printf("Node: %s\n", node->attribute);
for (int i = 0; i < node->branch_count; i++) {
for (int j = 0; j < level; j++) printf(" ");
printf(" [%s = %s]\n", node->attribute, node->branch_values[i]);
print_tree(node->children[i], level + 2);
}
}
const char *predict(Node *tree, Sample *sample) {
if (tree->is_leaf) return tree->label;
const char *val;
if (equals(tree->attribute, "Outlook")) val = sample->Outlook;
else if (equals(tree->attribute, "Temperature")) val = sample->Temperature;
else if (equals(tree->attribute, "Humidity")) val = sample->Humidity;
else val = sample->Windy;
for (int i = 0; i < tree->branch_count; i++)
if (equals(tree->branch_values[i], val)) return predict(tree->children[i], sample);
return "Unknown";
}
void extract_rules(Node *tree, char *current_rule, int *rule_index) {
if (tree->is_leaf) {
printf("规则 %d: %s => %s\n", (*rule_index)++, current_rule, tree->label);
return;
}
for (int i = 0; i < tree->branch_count; i++) {
char new_rule[MAX_RULE_LEN];
if (strlen(current_rule) == 0) snprintf(new_rule, sizeof(new_rule), "%s=%s", tree->attribute, tree->branch_values[i]);
else snprintf(new_rule, sizeof(new_rule), "%s AND %s=%s", current_rule, tree->attribute, tree->branch_values[i]);
extract_rules(tree->children[i], new_rule, rule_index);
}
}
int main() {
printf("原始数据:\n");
printf("========================================\n");
printf(" %-10s %-12s %-10s %-7s %-5s\n", "Outlook", "Temperature", "Humidity", "Windy", "Play?");
printf("----------------------------------------\n");
for (int i = 0; i < total_samples; i++)
printf(" %-10s %-12s %-10s %-7s %-5s\n", data[i].Outlook, data[i].Temperature, data[i].Humidity, data[i].Windy, data[i].Play);
printf("\n================================================================================\n\n");
printf("2. 计算各属性的信息增益率与熵:\n");
printf("--------------------------------------------------------------------------------\n");
double total_entropy = entropy(data, total_samples);
printf("数据集总熵:%.4f\n\n", total_entropy);
for (int i = 0; i < attr_count; i++) {
double ig = info_gain(data, total_samples, attributes[i], 1);
double si = split_info(data, total_samples, attributes[i]);
double gr = (si == 0) ? 0 : ig / si;
printf("%s:\n", attributes[i]);
printf(" 信息增益 (Info Gain): %.4f\n", ig);
printf(" 分裂信息 (Split Info): %.4f\n", si);
printf(" 信息增益率 (Gain Ratio): %.4f\n\n", gr);
}
printf("================================================================================\n\n");
printf("3. 选择根节点分裂属性:\n");
printf("--------------------------------------------------------------------------------\n");
double best_gr = -1;
int best_index = -1;
for (int i = 0; i < attr_count; i++) {
double gr = gain_ratio(data, total_samples, attributes[i]);
if (gr > best_gr) {
best_gr = gr;
best_index = i;
}
}
printf("最佳分裂属性:%s\n信息增益率:%.4f\n", attributes[best_index], best_gr);
printf("\n================================================================================\n\n");
printf("4. 建立决策树:\n");
printf("--------------------------------------------------------------------------------\n");
Node *tree = build_tree(data, total_samples, attributes, attr_count);
printf("决策树结构:\n");
print_tree(tree, 0);
printf("================================================================================\n\n");
printf("5. 对新输入进行分类预测:\n");
printf("--------------------------------------------------------------------------------\n");
Sample tests[] = {
{"sunny", "cool", "high", "true", ""},
{"overcast", "hot", "normal", "false", ""},
{"rain", "mild", "high", "false", ""},
{"sunny", "mild", "normal", "false", ""},
{"rain", "cool", "normal", "true", ""}
};
int test_count = 5;
for (int i = 0; i < test_count; i++) {
const char *pred = predict(tree, &tests[i]);
printf("测试样本 %d:\n", i + 1);
printf(" 输入:Outlook=%-8s Temperature=%-6s Humidity=%-8s Windy=%-5s\n", tests[i].Outlook, tests[i].Temperature, tests[i].Humidity, tests[i].Windy);
printf(" 预测结果:%s\n\n", pred);
}
printf("================================================================================\n");
printf("训练集预测准确率:\n");
printf("--------------------------------------------------------------------------------\n");
int correct = 0;
for (int i = 0; i < total_samples; i++) {
const char *pred = predict(tree, &data[i]);
if (equals(pred, data[i].Play)) correct++;
}
double acc = (double)correct / total_samples * 100.0;
printf("准确率:%d/%d = %.2f%%\n", correct, total_samples, acc);
printf("================================================================================\n\n");
printf("6. 提取决策规则:\n");
printf("--------------------------------------------------------------------------------\n");
int rule_index = 1;
extract_rules(tree, "", &rule_index);
printf("================================================================================\n");
return 0;
}