Natural language processing in R
TRANSCRIPT
Comcast Consumer Complaints: A first approach in R language
Olabanji [email protected]
Natural Language Processing (NLP)
NLP is a set of techniques for approaching text problems
Natural Language Processing (NLP)
A few questions • Word frequency • Variation across states
Comcast Consumer Complaints
• comcast_consumeraffairs_complaints.csv
• comcast_fcc_complaints_2015.csv
Raw complaint data about Comcast television and internet published at consumeraffairs.com between 04/08 and 09/16.
Raw complaints made to the FCC about Comcast between 04/15 and 06/15.
# Preliminaries ----
# Load the two raw complaint tables: reviews published on consumeraffairs.com
# and complaints filed with the FCC in 2015.
df <- read.csv("comcast_consumeraffairs_complaints.csv")
df_fcc <- read.csv("comcast_fcc_complaints_2015.csv")
# Quick structural check of both tables. Each call is followed by the output
# recorded on the slide (the `##` lines).

# Dimensions (rows, columns) of the consumeraffairs.com table.
dim(df)
##[1]56594
# Column names of the consumeraffairs.com table.
names(df)
##[1]"author""posted_on""rating""text"
# Dimensions (rows, columns) of the FCC table.
dim(df_fcc)
##[1]222511
# Column names of the FCC table (recorded output follows).
names(df_fcc)
##[1]"Ticket..""Customer.Complaint" ##[3]"Date""Time" ##[5]"Received.Via""City" ##[7]"State""Zip.code" ##[9]"Status""Filing.on.Behalf.of.Someone"
Comcast Consumer Affairs Complaints
# Bar chart of how often each rating value occurs in the full table.
ggplot(df) +
  geom_bar(aes(x = rating))

# Number of complaints per rating value (recorded output on the `###` line).
df %>%
  group_by(rating) %>%
  summarise(count = n())
###Atibble:6×2##ratingcount##<int><int>##101560##213734##32260##4354##5419##6532

# Drop the rows with rating 0 and redraw the distribution.
# NOTE(review): presumably a rating of 0 means "no rating given" -- confirm
# against the data source.
df2 <- df %>%
  filter(rating != 0)

ggplot(df2) +
  geom_bar(aes(x = rating))
# Derive a `State` column from the last two characters of the upper-cased
# `author` field, then rank states by number of complaints.
# NOTE(review): assumes `author` ends with a two-letter state abbreviation;
# verify, since any other trailing characters end up as a "State" too.
df3 <- df2 %>%
  mutate(State = str_sub(toupper(author), -2))

df3 %>%
  group_by(State) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
###Atibble:52×2##StateCount##<chr><int>##1FL650##2CA345##3GA320##4IL284##5PA221##6TN202##7TX193##8MI189##9WA168##10NJ167###...with42morerows
# Split the rated complaints into low (rating 1-2) and high (rating >= 3)
# groups; these feed the two word clouds.
low_rating <- df2 %>%
  filter(rating < 3)
high_rating <- df2 %>%
  filter(rating >= 3)

# Group sizes (recorded output on the `##` lines).
nrow(low_rating)
##[1]3994
nrow(high_rating)
##[1]105
# cs_ratio: customer satisfaction ratio -- the share of a state's complaints
# whose rating is above 2. `mean()` of the logical vector is that share.
df3 %>%
  select(State, rating) %>%
  group_by(State) %>%
  summarise(cs_ratio = mean(rating > 2)) %>%
  arrange(desc(cs_ratio))
###Atibble:52×2##Statecs_ratio##<chr><dbl>##1IA1.00000000##2ID1.00000000##3BC0.50000000##4NV0.33333333##5WV0.13333333##6NH0.10714286##7MO0.09090909##8ER0.06666667##9SC0.05263158##10AZ0.05000000###...with42morerows
States with high customer satisfaction ratio (rating >2)
# Word cloud for low ratings ----
# Terms excluded from the cloud: the company name plus English stop words.
low_stops <- c("comcast", stopwords("english"))

# Build and clean the corpus of low-rating complaint texts.
# - as.character(): make sure a factor column is handed over as text.
# - content_transformer(tolower): base functions must be wrapped so the
#   corpus stays a valid tm corpus (no PlainTextDocument workaround needed).
# - Stop words are removed BEFORE punctuation so contractions in the stop
#   list ("don't", "i'm", ...) still match the text.
low_ratingCorpus <- Corpus(VectorSource(as.character(low_rating$text))) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Stemming (stemDocument) is left disabled, as on the original slide.

wordcloud(
  low_ratingCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Word cloud for low ratings ----
# Terms excluded from the cloud: the company name plus English stop words.
low_stops <- c("comcast", stopwords("english"))

# Build and clean the corpus of low-rating complaint texts.
# - as.character(): make sure a factor column is handed over as text.
# - content_transformer(tolower): base functions must be wrapped so the
#   corpus stays a valid tm corpus (no PlainTextDocument workaround needed).
# - Stop words are removed BEFORE punctuation so contractions in the stop
#   list ("don't", "i'm", ...) still match the text.
low_ratingCorpus <- Corpus(VectorSource(as.character(low_rating$text))) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Stemming (stemDocument) is left disabled, as on the original slide.

wordcloud(
  low_ratingCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Word cloud for high ratings ----
# Terms excluded from the cloud: the company name plus English stop words.
temp_stops <- c("comcast", stopwords("english"))

# Build and clean the corpus of high-rating complaint texts.
# - as.character(): make sure a factor column is handed over as text.
# - content_transformer(tolower): base functions must be wrapped so the
#   corpus stays a valid tm corpus (no PlainTextDocument workaround needed).
# - Stop words are removed BEFORE punctuation so contractions in the stop
#   list ("don't", "i'm", ...) still match the text.
high_ratingCorpus <- Corpus(VectorSource(as.character(high_rating$text))) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, temp_stops) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Stemming (stemDocument) is left disabled, as on the original slide.

wordcloud(
  high_ratingCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
Comcast FCC Complaints (2015)
# State vs number of complaints, most complaints first.
temp1 <- df_fcc %>%
  group_by(State) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
# temp1

# One bar per state. The explicit x-axis limits keep the bars in the row
# order of temp1 (descending count) instead of alphabetical order.
ggplot(temp1, aes(x = State, y = no_complaints)) +
  geom_col() +
  scale_x_discrete(limits = temp1[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Grab the first 10 rows of temp1, i.e. the ten states with the most
# complaints, and plot them in that order.
temp2 <- temp1 %>%
  slice(1:10)

ggplot(temp2, aes(x = State, y = no_complaints)) +
  geom_col() +
  scale_x_discrete(limits = temp2[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Complaints per city, most complaints first (recorded output on the `###`
# line below).
temp3 <- df_fcc %>%
  group_by(City) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
###Atibble:928×2##Cityno_complaints##<fctr><int>##1Atlanta63##2Chicago47##3Knoxville36##4Houston33##5Jacksonville31##6Philadelphia25##7Denver22##8Miami22##9Nashville22##10Indianapolis21###...with918morerows

# One bar per city, in the row order of temp3. The axis labels are drawn at
# size 1 because every city gets its own tick.
ggplot(temp3, aes(x = City, y = no_complaints)) +
  geom_col() +
  scale_x_discrete(limits = temp3[["City"]]) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 1)
  )
# Keep the first 20 rows of temp3, i.e. the twenty cities with the most
# complaints (recorded output on the `###` line below).
temp4 <- temp3 %>%
  slice(1:20)
###Atibble:20×2##Cityno_complaints##<fctr><int>##1Atlanta63##2Chicago47##3Knoxville36##4Houston33##5Jacksonville31##6Philadelphia25##7Denver22##8Miami22##9Nashville22##10Indianapolis21##11SanFrancisco20##12SanJose20##13Baltimore19##14Tucson19##15Washington19##16Marietta16##17Portland16##18Seattle14##19Memphis13##20Canton12

# Bar chart of those cities, in the row order of temp4.
ggplot(temp4, aes(x = City, y = no_complaints)) +
  geom_col() +
  scale_x_discrete(limits = temp4[["City"]]) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1)
  )
# How often each distinct complaint title occurs, most frequent first.
# Titles are compared as-is, so differently-cased spellings count separately.
df_fcc %>%
  group_by(Customer.Complaint) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
###Atibble:1,842×2##Customer.Complaintno_complaints ##<fctr><int> ##1Comcast83 ##2ComcastInternet18 ##3ComcastDataCap17 ##4comcast13 ##5ComcastBilling11 ##6ComcastDataCaps11 ##7DataCaps11 ##8UnfairBillingPractices9 ##9Comcastdatacap8 ##10Comcastdatacaps8 ###...with1,832morerows
# Terms excluded from the FCC word cloud: the company name, a hand-picked
# list of filler words, and the standard English stop words.
all_stops <- c(
  "comcast", "now", "company", "day", "someone", "thing", "also",
  "got", "way", "call", "called", "one", "said", "tell",
  stopwords("english")
)

# Build and clean the corpus of FCC complaint titles.
# - as.character(): Customer.Complaint is a factor in the recorded output,
#   so hand it over explicitly as text.
# - content_transformer(tolower): base functions must be wrapped so the
#   corpus stays a valid tm corpus (no PlainTextDocument workaround needed).
# - Stop words are removed BEFORE punctuation so contractions in the stop
#   list ("don't", "i'm", ...) still match the text.
df_fccCorpus <- Corpus(
  VectorSource(as.character(df_fcc$Customer.Complaint))
) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, all_stops) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Stemming (stemDocument) is left disabled, as on the original slide.

wordcloud(
  df_fccCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
https://www.kaggle.com/dan195/d/archaeocharlie/comcastcomplaints/first-run
Credits: