#' Install (if missing) and attach a set of packages.
#'
#' For each package name in `libs`, the package is installed with
#' `install.packages()` when it cannot be found, and is then attached with
#' `library()`.
#'
#' @param libs Character vector of package names.
#' @return Called for its side effects (installing and attaching packages).
require_install <- function(libs) {
  for (pkg in libs) {
    # requireNamespace() is the recommended availability check for a handful
    # of packages; scanning .packages(all.available = TRUE) is slow and
    # only proves that a directory exists, not that the package is usable.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    # character.only = TRUE because `pkg` holds the name as a string.
    library(pkg, character.only = TRUE)
  }
}

require_install(libs = c("tm", "SnowballC", "tidytext", "dplyr", "wordcloud"))
Loading required package: NLP
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Loading required package: RColorBrewer
Corpus
# Ten small example documents used throughout the walkthrough.
doc1 <- "drugs, hospitals, doctors"
doc2 <- "smog, pollution, micro-plastics, environment."
doc3 <- "doctors, hospitals, healthcare"
doc4 <- "pollution, environment, water."
doc5 <- "I love NLP with deep learning."
doc6 <- "I love machine learning."
doc7 <- "He said he was keeping the wolf from the door."
doc8 <- "Time flies like an arrow, fruit flies like a banana."
doc9 <- "pollution, greenhouse gasses, GHG, hydrofluorocarbons, ozone hole, global warming. Montreal Protocol."
doc10 <- "greenhouse gasses, hydrofluorocarbons, perfluorocarbons, sulfur hexafluoride, carbon dioxide, carbon monoxide, CO2, hydrofluorocarbons, methane, nitrous oxide."

# Concatenate the documents into a single character vector
# (one element per document).
corpus <- c(
  doc1, doc2, doc3, doc4, doc5,
  doc6, doc7, doc8, doc9, doc10
)

# Wrap the character vector in a VectorSource and build the corpus object;
# each element of `corpus` becomes one document.
tm_corpus <- Corpus(VectorSource(corpus))
1
Concatenate the ten documents into a single character vector, `corpus`.
2
Create a corpus object, `tm_corpus`, from the `corpus` vector using `Corpus(VectorSource(corpus))`.
Next, let’s inspect the corpus.
# Print the corpus metadata followed by the content of every document.
inspect(x = tm_corpus)
3
Inspect the corpus: print its metadata and the content of each document.
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 10
[1] drugs, hospitals, doctors
[2] smog, pollution, micro-plastics, environment.
[3] doctors, hospitals, healthcare
[4] pollution, environment, water.
[5] I love NLP with deep learning.
[6] I love machine learning.
[7] He said he was keeping the wolf from the door.
[8] Time flies like an arrow, fruit flies like a banana.
[9] pollution, greenhouse gasses, GHG, hydrofluorocarbons, ozone hole, global warming. Montreal Protocol.
[10] greenhouse gasses, hydrofluorocarbons, perfluorocarbons, sulfur hexafluoride, carbon dioxide, carbon monoxide, CO2, hydrofluorocarbons, methane, nitrous oxide.
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 10
[1] drugs, hospitals, doctors
[2] smog, pollution, micro-plastics, environment.
[3] doctors, hospitals, healthcare
[4] pollution, environment, water.
[5] i love nlp with deep learning.
[6] i love machine learning.
[7] he said he was keeping the wolf from the door.
[8] time flies like an arrow, fruit flies like a banana.
[9] pollution, greenhouse gasses, ghg, hydrofluorocarbons, ozone hole, global warming. montreal protocol.
[10] greenhouse gasses, hydrofluorocarbons, perfluorocarbons, sulfur hexafluoride, carbon dioxide, carbon monoxide, co2, hydrofluorocarbons, methane, nitrous oxide.
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 10
[1] drugs hospitals doctors
[2] smog pollution microplastics environment
[3] doctors hospitals healthcare
[4] pollution environment water
[5] i love nlp with deep learning
[6] i love machine learning
[7] he said he was keeping the wolf from the door
[8] time flies like an arrow fruit flies like a banana
[9] pollution greenhouse gasses ghg hydrofluorocarbons ozone hole global warming montreal protocol
[10] greenhouse gasses hydrofluorocarbons perfluorocarbons sulfur hexafluoride carbon dioxide carbon monoxide co2 hydrofluorocarbons methane nitrous oxide
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 10
[1] drug hospit doctor
[2] smog pollut microplast environ
[3] doctor hospit healthcar
[4] pollut environ water
[5] love nlp deep learn
[6] love machin learn
[7] said keep wolf door
[8] time fli like arrow fruit fli like banana
[9] pollut greenhous gass ghg hydrofluorocarbon ozon hole global warm montreal protocol
[10] greenhous gass hydrofluorocarbon perfluorocarbon sulfur hexafluorid carbon dioxid carbon monoxid co hydrofluorocarbon methan nitrous oxid
Removing whitespace - a single whitespace character, or a run of several, may be treated as a token within a corpus. This is how we remove these tokens:
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 10
[1] drug hospit doctor
[2] smog pollut microplast environ
[3] doctor hospit healthcar
[4] pollut environ water
[5] love nlp deep learn
[6] love machin learn
[7] said keep wolf door
[8] time fli like arrow fruit fli like banana
[9] pollut greenhous gass ghg hydrofluorocarbon ozon hole global warm montreal protocol
[10] greenhous gass hydrofluorocarbon perfluorocarbon sulfur hexafluorid carbon dioxid carbon monoxid co hydrofluorocarbon methan nitrous oxid