lacR

A Chinese tokenizer for R based on Baidu LAC (Lexical Analysis of Chinese).

Overview
Compared with other Chinese word segmentation schemes, LAC performs notably well at extracting entity information, particularly personal names and place names.
Note
This package automatically creates a conda or Python virtual environment, which can run into bugs in RStudio. If you do not want to install the package, you can instead use the following code directly after loading reticulate:

# Point reticulate at your Python and import the LAC module
reticulate::use_python("C:/Users/xhuangcb/anaconda3/envs/spacy/python.exe") # your Python location
LAC <- reticulate::import("LAC")

lac_seg <- LAC$LAC(mode = "seg")         # segmentation only
lac_seg$load_customization("custom.txt") # optional: load a custom dictionary
lac_analysis <- LAC$LAC(mode = "lac")    # segmentation with part-of-speech tagging

library(purrr)
library(dplyr)
library(stringr)

tokenizer <- \(string, analysis = FALSE, progress = TRUE, min = 1) {
  if (progress) {
    bar <- list(
      format = "Processing: {cli::pb_current} {cli::pb_bar} {cli::pb_percent} Rate: {cli::pb_rate} ETA: {cli::pb_eta}"
    )
  } else {
    bar <- FALSE
  }
  if (!analysis) {
    map(
      string,
      \(x) {
        if (!is.na(nchar(x)) && nchar(x) > 1) {
          tokens <- lac_seg$run(x)
          tokens[nchar(tokens) > min] # drop words of `min` characters or fewer
        }
      },
      .progress = bar
    )
  } else {
    map(
      string,
      \(x) {
        if (!is.na(nchar(x)) && nchar(x) > 1) {
          tokens <- lac_analysis$run(x)     # list of two vectors: words and tags
          names(tokens[[1]]) <- tokens[[2]] # attach the tags as names
          tokens[[1]][nchar(tokens[[1]]) > min]
        }
      },
      .progress = bar
    )
  }
}

# How to call tokenizer inside a data frame
df %>%
  mutate(
    words = str_remove_all(contents, "\\p{P}|\\s+") |> tokenizer(analysis = TRUE, min = 1)
  )
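Because tokenizer() returns one character vector per input string, the words column above is a list-column. A minimal follow-up sketch (assuming df has a contents column of raw text; tidyr::unnest() and dplyr::count() are general tidyverse tools, not part of lacR) flattens it into one row per token for frequency counts:

library(tidyr)

df %>%
  mutate(words = tokenizer(str_remove_all(contents, "\\p{P}|\\s+"))) %>%
  unnest(words) %>%         # one row per segmented word
  count(words, sort = TRUE) # word-frequency table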
Installation
remotes::install_github("xinzhuohkust/lacR")
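If the automatic environment creation mentioned in the Note misbehaves in RStudio, one workaround is to build the environment yourself with reticulate before loading the package. A sketch, assuming conda is available; the environment name lac-env is arbitrary, and lac is the name of Baidu's LAC package on PyPI:

reticulate::conda_create("lac-env")                            # create a fresh conda environment
reticulate::py_install("lac", envname = "lac-env", pip = TRUE) # pip install lac into it
reticulate::use_condaenv("lac-env", required = TRUE)           # bind this R session to it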
Usage
setup
setup_lac(custom = FALSE, location = NULL) # do not use a custom dictionary
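To enable a custom dictionary instead, the sketch below assumes that setup_lac(custom = TRUE, location = ...) forwards the file to LAC's load_customization(). The file uses Baidu LAC's customization format: one entry per line, / attaches a tag, and a space forces a segmentation boundary.

writeLines(
  c(
    "春天/SEASON", # tag 春天 as SEASON
    "花/n 开/v",   # force 花 and 开 apart, with tags n and v
    "秋天的风"     # always keep this span as a single token
  ),
  "custom.txt"
)
setup_lac(custom = TRUE, location = "custom.txt") # hypothetical call; see the signature above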
text segmentation
tokenizer(
  string = "我和梅东在上海市中山北路与华东政法大学师生共度一个春节",
  analysis = FALSE, # do not perform part-of-speech tagging
  progress = TRUE,  # display a progress bar
  min = 0           # keep words of every length
)
[[1]]
[1] "我" "和" "梅东" "在" "上海市"
[6] "中山北路" "与" "华东政法大学" "师生" "共度"
[11] "一个" "春节"
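With analysis = TRUE, the same call also returns LAC's part-of-speech and entity tags, which the tokenizer above attaches as the names of the returned vector. A sketch of how that can be used (tag values follow LAC's scheme, e.g. PER for personal names and LOC for places; exact output not shown here):

tagged <- tokenizer(
  string = "我和梅东在上海市中山北路与华东政法大学师生共度一个春节",
  analysis = TRUE,
  min = 0
)
words <- tagged[[1]]
names(words)                                    # LAC tags, one per token
words[names(words) %in% c("PER", "LOC", "ORG")] # keep only named entities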