Natural language processing in R

Comcast Consumer Complaints: A first approach in R language

Olabanji [email protected]

mailto:[email protected]

Natural Language Processing (NLP)

NLP is a set of techniques for approaching text problems

Natural Language Processing (NLP)

A few questions • Word frequency • Variation across states

Comcast Consumer Complaints

• comcast_consumeraffairs_complaints.csv

• comcast_fcc_complaints_2015.csv

Raw complaint data about Comcast television and internet published at consumeraffairs.com between 04/08 and 09/16.

Raw complaints made to the FCC about Comcast between 04/15 and 06/15.

# Preliminaries

# Packages providing the functions used in the rest of the analysis.
library(dplyr)         # %>%, filter(), group_by(), summarise(), arrange(), ...
library(ggplot2)       # ggplot(), geom_bar(), theme()
library(stringr)       # str_sub()
library(tm)            # Corpus(), VectorSource(), tm_map(), stopwords()
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal()

# Load the two complaint data sets (paths are relative to the working
# directory). The statements were fused onto one line, which does not parse.
df <- read.csv("comcast_consumeraffairs_complaints.csv")
df_fcc <- read.csv("comcast_fcc_complaints_2015.csv")

# Size (rows, columns) of the Consumer Affairs data set.
dim(df)

## [1] 5659    4

# Column names of the Consumer Affairs data set.
names(df)

## [1] "author"    "posted_on" "rating"    "text"

# Size (rows, columns) of the FCC data set.
# NOTE(review): the printed digits were fused as "222511"; 2225 x 11 is the
# presumed split -- confirm against the CSV.
dim(df_fcc)

## [1] 2225   11

# Column names of the FCC data set (only the names printed on the slide).
names(df_fcc)

##  [1] "Ticket.."                    "Customer.Complaint"
##  [3] "Date"                        "Time"
##  [5] "Received.Via"                "City"
##  [7] "State"                       "Zip.code"
##  [9] "Status"                      "Filing.on.Behalf.of.Someone"

Comcast Consumer Affairs Complaints

# Bar chart of how often each rating value occurs.
ggplot(data = df) +
  geom_bar(mapping = aes(x = rating))

# The same distribution as a table: one row per rating value.
df %>%
  group_by(rating) %>%
  summarise(count = n())

## # A tibble: 6 × 2
##   rating count
##    <int> <int>
## 1      0  1560
## 2      1  3734
## 3      2   260
## 4      3    54
## 5      4    19
## 6      5    32

# Keep only the rows whose rating is not 0
# (presumably 0 means "no rating given" -- confirm against the data source).
df2 <- df %>%
  filter(rating != 0)

# Redraw the rating distribution without the zero ratings. This call was fused
# onto the assignment above, which does not parse as a single line.
ggplot(df2) +
  geom_bar(aes(x = rating))

# Derive a State column from the last two characters of the upper-cased
# `author` field (assumes the field ends with a two-letter state code; values
# that do not will yield arbitrary two-letter suffixes).
df3 <- df2 %>%
  mutate(State = str_sub(toupper(author), -2))

# Number of complaints per derived state, largest first.
df3 %>%
  group_by(State) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))

## # A tibble: 52 × 2
##    State Count
##    <chr> <int>
##  1    FL   650
##  2    CA   345
##  3    GA   320
##  4    IL   284
##  5    PA   221
##  6    TN   202
##  7    TX   193
##  8    MI   189
##  9    WA   168
## 10    NJ   167
## # ... with 42 more rows

# Split the non-zero ratings into two groups: ratings below 3 and ratings of
# 3 or more. The two assignments were fused onto one line, which does not
# parse.
low_rating <- df2 %>%
  filter(rating < 3)

high_rating <- df2 %>%
  filter(rating >= 3)

# Number of rows in the low-rating group.
nrow(low_rating)

## [1] 3994

# Number of rows in the high-rating group.
nrow(high_rating)

## [1] 105

# cs_ratio: customer satisfaction ratio, i.e. the share of a state's
# complaints whose rating is above 2. The comment used to be fused with the
# pipeline, which turned the whole statement into a comment.
df3 %>%
  select(State, rating) %>%
  group_by(State) %>%
  summarise(cs_ratio = length(rating[rating > 2]) / length(rating)) %>%
  arrange(desc(cs_ratio))

## # A tibble: 52 × 2
##    State   cs_ratio
##    <chr>      <dbl>
##  1    IA 1.00000000
##  2    ID 1.00000000
##  3    BC 0.50000000
##  4    NV 0.33333333
##  5    WV 0.13333333
##  6    NH 0.10714286
##  7    MO 0.09090909
##  8    ER 0.06666667
##  9    SC 0.05263158
## 10    AZ 0.05000000
## # ... with 42 more rows

States with high customer satisfaction ratio (rating >2)

# Word cloud for low ratings

# Words to drop before plotting: the company name plus English stop words.
low_stops <- c("comcast", stopwords("english"))

# Clean the low-rating complaint texts.
# - tolower is wrapped in content_transformer() so the documents keep their
#   class; the former tm_map(PlainTextDocument) workaround is then not needed.
# - English stop words are removed once: low_stops already contains them.
low_ratingCorpus <- Corpus(VectorSource(low_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  tm_map(stripWhitespace)
# tm_map(stemDocument)   # stemming left disabled, as in the original

# Plot up to 100 words, most frequent first (random.order = FALSE). This call
# used to sit behind the commented-out stemming step on the same line.
wordcloud(
  low_ratingCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Word cloud for high ratings

# Words to drop before plotting: the company name plus English stop words.
temp_stops <- c("comcast", stopwords("english"))

# Clean the high-rating complaint texts.
# tolower is wrapped in content_transformer() so the documents keep their
# class; the former tm_map(PlainTextDocument) workaround is then not needed.
high_ratingCorpus <- Corpus(VectorSource(high_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(tolower)) %>%
  # tm_map(removeWords, low_rating_stops) %>%   # disabled in the original
  tm_map(removeWords, temp_stops) %>%
  tm_map(stripWhitespace)
# tm_map(stemDocument)   # stemming left disabled, as in the original

# Plot up to 100 words, most frequent first (random.order = FALSE). This call
# used to sit behind the commented-out stemming step on the same line.
wordcloud(
  high_ratingCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)

Comcast FCC Complaints (2015)

# State vs number of complaints
# The comments used to be fused with the code on one line, which commented out
# both the assignment and the plot.
temp1 <- df_fcc %>%
  group_by(State) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
# temp1

# One bar per state; `limits` puts the bars in the sorted order of temp1, and
# the x labels are rotated 60 degrees.
ggplot(temp1, aes(x = State, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp1[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Grab the first 10 rows (temp1 is sorted by complaint count, descending).
# The comment used to be fused with the code, which commented out the whole
# line.
temp2 <- temp1 %>%
  slice(1:10)

# Same bar chart as for temp1, restricted to these ten states.
ggplot(temp2, aes(x = State, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp2[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Number of FCC complaints per city, most complaints first.
temp3 <- df_fcc %>%
  group_by(City) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))

## # A tibble: 928 × 2
##            City no_complaints
##          <fctr>         <int>
##  1      Atlanta            63
##  2      Chicago            47
##  3    Knoxville            36
##  4      Houston            33
##  5 Jacksonville            31
##  6 Philadelphia            25
##  7       Denver            22
##  8        Miami            22
##  9    Nashville            22
## 10 Indianapolis            21
## # ... with 918 more rows

# One bar per city in the sorted order of temp3; the x labels are rotated and
# drawn at size 1.
ggplot(data = temp3, mapping = aes(x = City, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp3[["City"]]) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 1)
  )

# First 20 rows of temp3, i.e. the 20 cities with the most complaints.
temp4 <- temp3 %>%
  slice(1:20)

## # A tibble: 20 × 2
##             City no_complaints
##           <fctr>         <int>
##  1       Atlanta            63
##  2       Chicago            47
##  3     Knoxville            36
##  4       Houston            33
##  5  Jacksonville            31
##  6  Philadelphia            25
##  7        Denver            22
##  8         Miami            22
##  9     Nashville            22
## 10  Indianapolis            21
## 11 San Francisco            20
## 12      San Jose            20
## 13     Baltimore            19
## 14        Tucson            19
## 15    Washington            19
## 16      Marietta            16
## 17      Portland            16
## 18       Seattle            14
## 19       Memphis            13
## 20        Canton            12

# Bar chart of those 20 cities, in the order they appear in temp4.
ggplot(data = temp4, mapping = aes(x = City, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp4[["City"]]) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1)
  )

# How often each distinct complaint title occurs, most frequent first.
df_fcc %>%
  group_by(Customer.Complaint) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))

## # A tibble: 1,842 × 2
##          Customer.Complaint no_complaints
##                      <fctr>         <int>
##  1                  Comcast            83
##  2         Comcast Internet            18
##  3         Comcast Data Cap            17
##  4                  comcast            13
##  5          Comcast Billing            11
##  6        Comcast Data Caps            11
##  7                Data Caps            11
##  8 Unfair Billing Practices             9
##  9         Comcast data cap             8
## 10        Comcast data caps             8
## # ... with 1,832 more rows

# Words to drop from the FCC complaint titles: the company name, a set of
# filler words, and English stop words.
# (The original opened "comcast" with a typographic quote, which does not
# parse in R.)
all_stops <- c(
  "comcast", "now", "company", "day", "someone", "thing", "also",
  "got", "way", "call", "called", "one", "said", "tell",
  stopwords("english")
)

# Clean the complaint titles.
# tolower is wrapped in content_transformer() so the documents keep their
# class; the former tm_map(PlainTextDocument) workaround is then not needed.
df_fccCorpus <- Corpus(VectorSource(df_fcc$Customer.Complaint)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, all_stops) %>%
  tm_map(stripWhitespace)
# tm_map(stemDocument)   # stemming left disabled, as in the original

# Plot up to 100 words, most frequent first (random.order = FALSE). This call
# used to sit behind the commented-out stemming step on the same line.
wordcloud(
  df_fccCorpus,
  scale = c(5, 0.5),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)

https://www.kaggle.com/dan195/d/archaeocharlie/comcastcomplaints/first-run

Credits:

Natural language processing in R

Technology