lacR

A Chinese tokenizer for R based on Baidu LAC (Lexical Analysis of Chinese).

Author: Xinzhuo Huang (HKUST SOSC)

Published: October 20, 2023
Modified: February 16, 2024

Overview

Compared with other Chinese word-segmentation tools, LAC performs notably well at entity extraction, particularly for personal and place names (see the part-of-speech tagging example under Usage).



Note

This package automatically creates a conda or Python virtual environment, which can run into bugs in RStudio. If you prefer not to install the package, you can reproduce its behavior directly with reticulate using the following code.

reticulate::use_python("C:\\Users\\xhuangcb\\anaconda3\\envs\\spacy\\python.exe") # path to your Python executable

LAC <- reticulate::import("LAC")

lac_seg <- LAC$LAC(mode = "seg") # segmentation-only model

lac_seg$load_customization("custom.txt") # optional: load a custom dictionary

lac_analysis <- LAC$LAC(mode = "lac") # segmentation plus part-of-speech tagging
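
# For reference, a sketch of the dictionary format load_customization() reads
# (following Baidu LAC's documented customization format; "custom.txt" above is
# a placeholder): one entry per line, an optional "/TAG" forces that tag, and a
# space inside an entry forces a segmentation boundary, e.g.
#   春天/SEASON
#   花/n 开/v
#   中国人民/n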

library(purrr) # provides map() and its .progress bar

tokenizer <- \(string, analysis = FALSE, progress = TRUE, min = 1) {
    if (progress) {
        bar <- list(
            format = "Processing: {cli::pb_current}  {cli::pb_bar} {cli::pb_percent}  Rate: {cli::pb_rate}  ETA: {cli::pb_eta}"
        )
    } else {
        bar <- FALSE
    }

    if (!analysis) {
        map(
            string,
            \(x) {
                # skip NA and single-character strings (they yield NULL)
                if (!is.na(x) && nchar(x) > 1) {
                    tokens <- lac_seg$run(x)
                    tokens[nchar(tokens) > min] # keep tokens longer than `min` characters
                }
            },
            .progress = bar
        )
    } else {
        map(
            string,
            \(x) {
                if (!is.na(x) && nchar(x) > 1) {
                    tokens <- lac_analysis$run(x) # a list: [[1]] tokens, [[2]] tags
                    names(tokens[[1]]) <- tokens[[2]] # attach tags as names
                    tokens[[1]][nchar(tokens[[1]]) > min]
                }
            },
            .progress = bar
        )
    }
}

# Calling tokenizer on a data-frame column:
df |>
    mutate(
        words = str_remove_all(contents, "\\p{P}|\\s+") |> # strip punctuation and whitespace
            tokenizer(analysis = TRUE, min = 1)
    )
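
Because tokenizer() returns a list-column, a common next step is unnesting it to one row per token. A minimal self-contained sketch, assuming dplyr, tidyr, and stringr are attached (the df and its contents column here are hypothetical):

library(dplyr)
library(tidyr)
library(stringr)

df <- tibble(contents = "我和梅东在上海市中山北路与华东政法大学师生共度一个春节")

df |>
    mutate(words = str_remove_all(contents, "\\p{P}|\\s+") |> tokenizer()) |>
    unnest_longer(words) # one row per token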

Installation

remotes::install_github("xinzhuohkust/lacR")
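
Then attach the package:

library(lacR)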

Usage

setup

setup_lac(custom = FALSE, location = NULL) # do not use a custom dictionary
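
With a custom dictionary, presumably (an assumption inferred from the argument names above, not confirmed by the package documentation) you would point location at a LAC-format dictionary file:

setup_lac(custom = TRUE, location = "custom.txt") # assumed usage; path to a LAC-format dictionary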

text segmentation

tokenizer(
  string = "我和梅东在上海市中山北路与华东政法大学师生共度一个春节",
  analysis = FALSE, # do not perform part-of-speech tagging
  progress = TRUE, # display progress bar
  min = 0 # keep all words
)
[[1]]
 [1] "我"           "和"           "梅东"         "在"           "上海市"      
 [6] "中山北路"     "与"           "华东政法大学" "师生"         "共度"        
[11] "一个"         "春节"        
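
part-of-speech tagging

With analysis = TRUE the same call returns each token named with its LAC tag; entity tags such as PER (person) and LOC (place) are where LAC's strength in name extraction shows. A sketch of the call (output omitted; for example, 梅东 should come back tagged PER and 上海市 LOC):

tokenizer(
  string = "我和梅东在上海市中山北路与华东政法大学师生共度一个春节",
  analysis = TRUE, # also perform part-of-speech tagging
  progress = TRUE,
  min = 0
)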

Rcpp version (coming soon)