SVM и RTextTools, n-граммы: почему для униграмм, биграмм и триграмм получается одинаковая точность

#r #matrix

#r #матрица

Вопрос:

Я хочу оценить точность классификации SVM с использованием n-грамм (униграмм, биграмм и триграмм). Однако точность, которую я получаю для трёх разных видов n-грамм, одинакова. Как исправить код, чтобы получить корректные значения точности?

Прочитать набор данных

 # Toy corpus: 15 short sleep-advice sentences, one document per row.
 # `df_id` is a running document id and `df_content` holds the raw text.
 # The `&` in document 10 is restored from a stripped HTML entity ("amp;"),
 # which would otherwise introduce a spurious three-letter token "amp".
 df.data <- data.frame(
   df_id = 1:15,
   df_content = c(
     "Sleep difficulties are very common",
     "Getting high quality sleep is key for good health",
     "Sleeping well is just as important as exercise and eating wisely",
     "two most common sleep issues - insomnia and obstructive sleep apnea",
     "One in three people have problems sleeping at some point in their lives",
     "Talk to your health care team to learn about alternatives to sleep medications, including remote insomnia treatment options",
     "Effective treatments are available for both Insomnia and Sleep Apnea",
     "Avoid using electronic devices in the bedroom",
     "Avoid alcohol and caffeine before bedtime",
     "Identify stressors & continue to manage stress",
     "Always shower before you go to bed. It will make a lot of difference.",
     "If you are eating meat and other kinds of meals, it is best to eat at least three to four hours before you go to bed so that the digestion is over",
     "Just light one little lamp somewhere in the room where you sleep and you will see that these things will completely disappear",
     "Sleep is a state where you are on the edge between the world of sounds and the world of silence, but you can only move into the world of silence when you are aware.",
     "Improving sleep quality does not mean sleeping like a stone."
   ),
   stringsAsFactors = FALSE
 )
 df.data

Получение оценки тональности на основе словаря AFINN

 library("syuzhet")

 # AFINN-based sentiment score for every document.
 df.data.sentiment <- get_sentiment(df.data$df_content, method = "afinn")
 df.data.sentiment

 # Collapse numeric sentiment scores into three classes.
 #
 # Args:
 #   convert1: numeric vector of sentiment scores.
 # Returns:
 #   A numeric vector of the same length holding -1 for negative scores,
 #   1 for positive scores and 0 for scores equal to zero.
 #
 # sign() is vectorized, so no element-by-element loop (and no growing of
 # the result vector) is needed. as.numeric() keeps the result a double
 # even for integer input, matching the -1 / 0 / 1 doubles used before.
 convertto3score <- function(convert1) {
   as.numeric(sign(convert1))
 }

 df.data.sentiment <- convertto3score(df.data.sentiment)
 df.data.sentiment

 # Store the three-class label next to the text.
 df.data$sentiment_score <- df.data.sentiment
 df.data

Преобразование sentiment_score в фактор

 # Encode the sentiment label as a factor with the fixed level order
 # -1, 0, 1, so all three levels are declared even if one of them does
 # not occur in the data.
 df.data$sentiment_score <- factor(
   df.data$sentiment_score,
   levels = c(-1, 0, 1)
 )
 df.data$sentiment_score

Перемешивание строк во фрейме данных

 # Shuffle the rows; the fixed seed makes the permutation reproducible.
 set.seed(1234)
 df.data <- df.data[sample(nrow(df.data)), ]

Create the n-gram functions (adapted from create_matrix() in the RTextTools package)

 # Build a weighted document-term matrix of word n-grams.
 # Adapted from create_matrix() in the RTextTools package.
 #
 # Args:
 #   textColumns:       text data; as.matrix() is applied and the cells of
 #                      each row are pasted into one document.
 #   language:          language handed to the stemmer, the corpus reader
 #                      and the tm control list.
 #   minDocFreq, maxDocFreq:
 #                      passed to tm as control$bounds$local.
 #   minWordLength, maxWordLength:
 #                      passed to tm as control$wordLengths.
 #   ngramLength:       number of words per term; values > 1 switch from
 #                      tm::scan_tokenizer to a tau::textcnt() tokenizer.
 #   originalMatrix:    optional training matrix. When given, the result is
 #                      restricted to its terms and padded with constant
 #                      columns for terms missing from the new documents.
 #   removeNumbers, removePunctuation, stripWhitespace, toLower:
 #                      forwarded to the tm control list.
 #   removeSparseTerms: if > 0, passed to tm::removeSparseTerms().
 #   removeStopwords:   forwarded as control$stopwords.
 #   stemWords:         if TRUE, stem tokens with SnowballC::wordStem().
 #   weighting:         tm weighting function.
 # Returns:
 #   A DocumentTermMatrix with its columns sorted by term name.
 create_matrix_unigram <- function(textColumns, language = "english",
                                   minDocFreq = 1, maxDocFreq = Inf,
                                   minWordLength = 3, maxWordLength = Inf,
                                   ngramLength = 1, originalMatrix = NULL,
                                   removeNumbers = FALSE,
                                   removePunctuation = TRUE,
                                   removeSparseTerms = 0,
                                   removeStopwords = TRUE,
                                   stemWords = FALSE,
                                   stripWhitespace = TRUE, toLower = TRUE,
                                   weighting = weightTfIdf) {
   stem_words <- function(x) {
     SnowballC::wordStem(unlist(strsplit(x, " ")), language = language)
   }

   # Returns the distinct n-grams of one document. as.character() is
   # applied first so the tokenizer accepts both a plain string and a tm
   # text document.
   tokenize_ngrams <- function(x, n = ngramLength) {
     counts <- tau::textcnt(as.character(x), method = "string", n = n)
     as.character(names(counts))
   }

   control <- list(
     bounds = list(local = c(minDocFreq, maxDocFreq)),
     language = language,
     tolower = toLower,
     removeNumbers = removeNumbers,
     removePunctuation = removePunctuation,
     stopwords = removeStopwords,
     stripWhitespace = stripWhitespace,
     wordLengths = c(minWordLength, maxWordLength),
     weighting = weighting
   )

   # The insertion positions are kept exactly as in the original code:
   # tm's termFreq() processes some control options in list order.
   if (ngramLength > 1) {
     control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
   } else {
     control <- append(
       control, list(tokenize = tm::scan_tokenizer), after = 4
     )
   }
   if (isTRUE(as.logical(stemWords))) {
     control <- append(control, list(stemming = stem_words), after = 7)
   }

   documents <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
   # iconv() is vectorized; unname() keeps the document texts from being
   # carried along as element names.
   documents <- unname(
     iconv(as.vector(documents, mode = "character"), to = "UTF8",
           sub = "byte")
   )

   # VCorpus instead of Corpus: Corpus() on a VectorSource creates a
   # SimpleCorpus, for which DocumentTermMatrix() uses a fixed built-in
   # tokenizer and ignores control$tokenize. That made the unigram, bigram
   # and trigram matrices (and hence the accuracies) identical.
   corpus <- tm::VCorpus(
     tm::VectorSource(documents),
     readerControl = list(language = language)
   )
   dtm <- tm::DocumentTermMatrix(corpus, control = control)
   if (removeSparseTerms > 0) {
     dtm <- tm::removeSparseTerms(dtm, removeSparseTerms)
   }

   if (!is.null(originalMatrix)) {
     # Align the columns with the training matrix: keep shared terms and
     # add a constant column for every training term absent here.
     missing_terms <- setdiff(colnames(originalMatrix), colnames(dtm))

     weight <- 0
     if (isTRUE(attr(originalMatrix, "weighting")[2] == "tf-idf")) {
       weight <- 0.000000001
     }
     padding <- matrix(
       weight, nrow = nrow(dtm), ncol = length(missing_terms)
     )
     colnames(padding) <- missing_terms
     rownames(padding) <- rownames(dtm)

     shared <- dtm[, which(colnames(dtm) %in% colnames(originalMatrix))]
     dtm <- tm::as.DocumentTermMatrix(
       cbind(shared, padding),
       weighting = weighting
     )
   }

   dtm[, sort(colnames(dtm))]
 }

 # The bigram and trigram builders are the same function with a different
 # default for `ngramLength`; copying the closure and changing that one
 # formal keeps the three signatures identical without triplicating the
 # body.
 create_matrix_bigram <- create_matrix_unigram
 formals(create_matrix_bigram)$ngramLength <- 2

 create_matrix_trigram <- create_matrix_unigram
 formals(create_matrix_trigram)$ngramLength <- 3

bag-of-words model

 library("RTextTools")
 library("tm")

 # Document-term matrix over all documents (column 2 holds the text).
 # Exactly one of the three variants below is active at a time.
 matrix <- create_matrix_unigram(
   df.data[, 2],
   language = "english",
   weighting = weightTfIdf,
   ngramLength = 1
 )
 # matrix <- create_matrix_bigram(
 #   df.data[, 2],
 #   language = "english", weighting = weightTfIdf, ngramLength = 2
 # )
 # matrix <- create_matrix_trigram(
 #   df.data[, 2],
 #   language = "english", weighting = weightTfIdf, ngramLength = 3
 # )

cross validation only for the first 7 rows

 # Cross-validate an SVM on the first 7 (shuffled) documents only;
 # column 3 holds the sentiment label.
 # NOTE(review): nfold = 10 is larger than the 7 rows in this container,
 # so some folds are presumably empty - confirm whether a smaller nfold
 # was intended.
 containercv <- create_container(
   matrix,
   df.data[, 3],
   trainSize = 1:7,
   virgin = FALSE
 )
 cvres <- cross_validate(
   containercv,
   nfold = 10,
   algorithm = "SVM",
   seed = 1234
 )

final model fitting

 # 70/30 split: the first 70% of the shuffled rows train the model and
 # the remaining rows are held out for evaluation.
 # The held-out ids were previously computed from
 # floor(nrow(df.data) * 0.3) + 1, which starts inside the training range
 # and therefore evaluated the model partly on its own training documents.
 # setdiff() guarantees the two sets are disjoint.
 trainids <- seq_len(floor(nrow(df.data) * 0.7))
 testids <- setdiff(seq_len(nrow(df.data)), trainids)

 containerfinal <- create_container(
   matrix,
   df.data[, 3],
   trainSize = trainids,
   virgin = FALSE
 )
 models <- train_models(containerfinal, algorithms = "SVM")

evaluation

 # Held-out documents and their true sentiment labels.
 texts <- df.data[, 2][testids]
 trueclass <- df.data[, 3][testids]

 # Build the test matrix with the same n-gram variant that was used for
 # `matrix`; originalMatrix = matrix aligns its columns with the training
 # terms.
 testmatrix <- create_matrix_unigram(
   texts,
   language = "english",
   weighting = weightTfIdf,
   ngramLength = 1,
   originalMatrix = matrix
 )
 # testmatrix <- create_matrix_bigram(
 #   texts,
 #   language = "english", weighting = weightTfIdf, ngramLength = 2,
 #   originalMatrix = matrix
 # )
 # testmatrix <- create_matrix_trigram(
 #   texts,
 #   language = "english", weighting = weightTfIdf, ngramLength = 3,
 #   originalMatrix = matrix
 # )

 results <- predict(models[[1]], testmatrix)
 table(trueclass, results)

 # accuracy
 # Compare as character: `==` on two factors raises an error when their
 # level sets differ.
 mean(as.character(trueclass) == as.character(results))

 # Accuracies reported for the original run of this script:
 # unigram accuracy --> [1] 0.5454545
 # bigram accuracy --> [1] 0.5454545
 # trigram accuracy --> [1] 0.5454545