091. 编写代码实现简单的自然语言处理算法

在 C 语言中实现一个简单的自然语言处理（NLP）算法是一个很好的练习，可以帮助你理解文本处理的基本概念。这里我将展示一个简单的文本分词（Tokenization）算法的实现。分词是自然语言处理中的一个基本步骤，目的是将文本分割成单词、短语或符号等有意义的单元。

简单的文本分词算法

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAX_TOKENS 100
#define MAX_TOKEN_LENGTH 50

// Return non-zero when c separates tokens, i.e. it is whitespace or
// punctuation according to <ctype.h>; return 0 otherwise.
//
// The <ctype.h> classifiers are only defined for arguments representable as
// unsigned char (or EOF). A plain char holding a byte >= 0x80 (for example
// part of a UTF-8 sequence) may be negative, so it is converted first.
int isDelimiter(char c) {
    unsigned char uc = (unsigned char)c;
    return isspace(uc) || ispunct(uc);
}

// Split text into tokens separated by delimiter characters (isDelimiter).
//
// text:   NUL-terminated input string.
// tokens: output table. At most MAX_TOKENS tokens are stored; each one is
//         truncated to MAX_TOKEN_LENGTH - 1 characters and NUL-terminated.
//
// When fewer than MAX_TOKENS tokens are found, the slot right after the last
// token is set to the empty string. A caller that scans for the first empty
// entry therefore stops at the right place even if the table was never
// initialized and the text is empty or does not end in a delimiter.
void tokenize(const char* text, char tokens[MAX_TOKENS][MAX_TOKEN_LENGTH]) {
    int tokenCount = 0;
    int i = 0;

    while (text[i] != '\0' && tokenCount < MAX_TOKENS) {
        // Skip the delimiters in front of the next word.
        while (isDelimiter(text[i])) {
            i++;
        }

        // Copy the word. Characters that do not fit are consumed but dropped,
        // so an over-long word never spills into the next token.
        int tokenLength = 0;
        while (text[i] != '\0' && !isDelimiter(text[i])) {
            if (tokenLength < MAX_TOKEN_LENGTH - 1) {
                tokens[tokenCount][tokenLength++] = text[i];
            }
            i++;
        }
        tokens[tokenCount][tokenLength] = '\0';

        // Trailing delimiters produce an empty word; only keep real tokens.
        if (tokenLength > 0) {
            tokenCount++;
        }
    }

    // Always mark the end of the token list when there is room for a marker.
    if (tokenCount < MAX_TOKENS) {
        tokens[tokenCount][0] = '\0';
    }
}

// Read one line from standard input, tokenize it, and print one token per
// line. Returns EXIT_FAILURE when no input could be read.
int main(void) {
    char text[1000];
    // Zero-initialized so that every unused slot is an empty string; the
    // print loop below stops at the first empty entry.
    char tokens[MAX_TOKENS][MAX_TOKEN_LENGTH] = {{0}};

    printf("Enter text: ");
    // fgets returns NULL on EOF or a read error, in which case text must not
    // be used.
    if (fgets(text, sizeof(text), stdin) == NULL) {
        fprintf(stderr, "Failed to read input.\n");
        return EXIT_FAILURE;
    }

    // Strip the trailing newline kept by fgets (no-op when there is none).
    text[strcspn(text, "\n")] = '\0';

    tokenize(text, tokens);

    // Print tokens until the first empty slot or the end of the table.
    printf("Tokens:\n");
    for (int i = 0; i < MAX_TOKENS && tokens[i][0] != '\0'; i++) {
        printf("%s\n", tokens[i]);
    }

    return 0;
}

代码说明

输入文本：

使用 fgets 从用户输入中读取文本。
去掉输入字符串末尾的换行符。

分隔符判断：

使用 isDelimiter 函数判断字符是否是分隔符（空格、标点符号等）。

分词逻辑：

遍历输入文本，跳过分隔符，读取单词。
将单词存储到二维字符数组 tokens 中。

打印结果：

遍历 tokens 数组，打印所有分词结果。

示例运行

假设用户输入以下文本：Enter text: Hello, world! This is a simple NLP example.

程序输出：

Tokens:
Hello
world
This
is
a
simple
NLP
example

扩展功能

支持更多分隔符：可以扩展 isDelimiter 函数，支持更多类型的分隔符，例如特殊字符。
大小写转换：在分词过程中，可以将所有单词转换为小写，以便进行不区分大小写的处理。
停用词过滤：可以添加一个停用词列表，过滤掉常见的无意义单词（如“the”、“is”等）。
词干提取（Stemming）：实现一个简单的词干提取算法，将单词还原到其基本形式。

大小写转换和停用词过滤

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAX_TOKENS 100
#define MAX_TOKEN_LENGTH 50
#define MAX_STOP_WORDS 10

// Stop-word table consulted by isStopWord(), which compares a token against
// each entry with strcmp. Entries are lowercase; tokenize() lowercases every
// token before the lookup. The initializer fills all MAX_STOP_WORDS slots.
char stopWords[MAX_STOP_WORDS][MAX_TOKEN_LENGTH] = {
    "the", "is", "a", "an", "and", "in", "on", "at", "to", "of"
};

// Return non-zero when c separates tokens, i.e. it is whitespace or
// punctuation according to <ctype.h>; return 0 otherwise.
//
// The <ctype.h> classifiers are only defined for arguments representable as
// unsigned char (or EOF). A plain char holding a byte >= 0x80 (for example
// part of a UTF-8 sequence) may be negative, so it is converted first.
int isDelimiter(char c) {
    unsigned char uc = (unsigned char)c;
    return isspace(uc) || ispunct(uc);
}

// Report whether word exactly matches an entry of the stopWords table.
// Returns 1 on a match and 0 otherwise; the comparison (strcmp) is
// case-sensitive.
int isStopWord(const char* word) {
    int idx = 0;

    while (idx < MAX_STOP_WORDS) {
        if (!strcmp(word, stopWords[idx])) {
            return 1;
        }
        idx++;
    }

    return 0;
}

// Split text into lowercase tokens separated by delimiter characters
// (isDelimiter), dropping every token that isStopWord() recognizes.
//
// text:   NUL-terminated input string.
// tokens: output table. At most MAX_TOKENS tokens are stored; each one is
//         truncated to MAX_TOKEN_LENGTH - 1 characters and NUL-terminated.
//
// When fewer than MAX_TOKENS tokens are kept, the slot right after the last
// kept token is set to the empty string. This marks the end of the list for
// a caller that scans for the first empty entry, and it also erases a
// rejected stop word that would otherwise be left behind in that slot when
// the text ends with one.
void tokenize(const char* text, char tokens[MAX_TOKENS][MAX_TOKEN_LENGTH]) {
    int tokenCount = 0;
    int i = 0;

    while (text[i] != '\0' && tokenCount < MAX_TOKENS) {
        // Skip the delimiters in front of the next word.
        while (isDelimiter(text[i])) {
            i++;
        }

        // Copy the word in lowercase. Characters that do not fit are consumed
        // but dropped. tolower requires an unsigned char value, hence the cast.
        int tokenLength = 0;
        while (text[i] != '\0' && !isDelimiter(text[i])) {
            if (tokenLength < MAX_TOKEN_LENGTH - 1) {
                tokens[tokenCount][tokenLength++] =
                    (char)tolower((unsigned char)text[i]);
            }
            i++;
        }
        tokens[tokenCount][tokenLength] = '\0';

        // Keep the slot only for a non-empty word that is not a stop word;
        // otherwise the next word overwrites it.
        if (tokenLength > 0 && !isStopWord(tokens[tokenCount])) {
            tokenCount++;
        }
    }

    // Always mark the end of the token list when there is room for a marker.
    if (tokenCount < MAX_TOKENS) {
        tokens[tokenCount][0] = '\0';
    }
}

// Read one line from standard input, tokenize it, and print one token per
// line. Returns EXIT_FAILURE when no input could be read.
int main(void) {
    char text[1000];
    // Zero-initialized so that every unused slot is an empty string; the
    // print loop below stops at the first empty entry.
    char tokens[MAX_TOKENS][MAX_TOKEN_LENGTH] = {{0}};

    printf("Enter text: ");
    // fgets returns NULL on EOF or a read error, in which case text must not
    // be used.
    if (fgets(text, sizeof(text), stdin) == NULL) {
        fprintf(stderr, "Failed to read input.\n");
        return EXIT_FAILURE;
    }

    // Strip the trailing newline kept by fgets (no-op when there is none).
    text[strcspn(text, "\n")] = '\0';

    tokenize(text, tokens);

    // Print tokens until the first empty slot or the end of the table.
    printf("Tokens:\n");
    for (int i = 0; i < MAX_TOKENS && tokens[i][0] != '\0'; i++) {
        printf("%s\n", tokens[i]);
    }

    return 0;
}

示例运行

假设用户输入以下文本：Enter text: The quick brown fox jumps over the lazy dog.

程序输出：

Tokens:
quick
brown
fox
jumps
over
lazy
dog

视频讲解

BiliBili： 视睿网络-哔哩哔哩视频 (bilibili.com)