###########################################
#                                         #
#      R reference for wikispiral         #
#                                         #
###########################################


# This document is a working document to reference and comment all R functions tested or used on wikispiral.org

# To download a file and include it in a R variable
#download.file(url="http://wikispiral.org/temp_R/stats_4_R.txt", destfile="stats_4_R.txt")

# To include in a R variable a local file
#file.path <- "/home/joel/Bureau/testR/stats_4_R.txt"

# To check the working directory
getwd()

# To gather info about your session (R version, platform, attached packages)
sessionInfo()
# To see how the current locale encodes text (e.g. whether it is UTF-8).
# (iconv() converts a character vector between encodings and cannot be called
# without arguments; see ?iconv, and iconvlist() for the known encodings)
l10n_info()

# To set the working directory like the one on the webserver,
# create a "export_stats_files" directory, then a "tempR" in the same
# directory (like "E:/export_stats_files" and "E:/tempR")
# Then in the menu tools > options > general, set up the temp folder
# as E:/tempR
# Then the commands executed will be exactly the same as on the
# website, copy and paste should be enough :)
setwd("../tempR")

# To load the exported statistics into the data frame s4r:
# fields are separated by "|", quote characters are not treated specially
# (quote = ""), the first line of the file is skipped and the following line
# provides the column names (header = TRUE)
s4r <- read.delim(file = "../export_stats_files/stats_4_R.txt", header = TRUE,
                  sep = "|", quote = "", skip = 1)

####### Testing your data #######
# Show the beginning of the data
head(s4r)

# Show the end of the data
tail(s4r)

# Show the locale currently in use
Sys.getlocale()
# Switch every locale category to French.
# NOTE: "French" is a Windows-style locale name; other systems usually expect
# something like "fr_FR.UTF-8". Sys.setlocale() returns "" with a warning when
# the requested locale cannot be set.
Sys.setlocale("LC_ALL", "French")

# Load stringr (string helpers such as str_extract / str_split)
library(stringr)

# Show the column names of the data
colnames(s4r)

# SELECT
# gccountry.value AS country,
# gcterritory.value AS territory,
# crit.value AS critere, # What an individual said as answer of criteria for well being, etc.
# gc.value AS GC, 
# ghlabel.value AS GH,
########
# General tests with cited packages
#library(rcmdr)
#install.packages("Rcmdr")

###########################################
# Quick test with Spiral data
# Remove everything from memory
# rm (list = ls() ) (to clean the workspace to run stuff anew)
# q.value AS question,
# nbexpress.value AS nbexpress,
# ind.value AS IND, # Indicators  (A01, A02, A03...), sublevels of Dimensions (A, B, C, D, E, F, G, H, I)
# cat.value AS CAT, # (O) Obtention, (P) Possibility, (Q) Quality, (S) Sustainability
# proposition.value AS proposition
# --- (new columns to be added for Tue March 26th, 2013, by Jyhem) --------------
# Gender (Sex)
# Age Group (ranges): 
## Children: Until 6 (children not in school), 6-12, 12-18 (teenagers) 
## Adults: 18-25 (Young people), 25-40, 40-60 
## Senior: 60-80 (Third Age), >80 (Fourth)
## (undefined)
# Maximum and minimum age
# Type of Homogeneous Group.
# Beneficiaries
# Full Object Code. 3rd level of categorization below: Dimensions (A) > Indicators (A01) > Object Code (A01-02)
# Full Meaning Code. 3d table. P16: P (Possibility), 1 (something), 6 (something)

# In case if the colnames are not defined
#colnames(s4r) <- c("gccountry", "gcterritory", "crit", "gc", "ghlabel", 
#              "q", "nbexpress", "ind", "cat", "proposition")


# Quick overview of the loaded data
head(s4r)     # first rows
class(s4r)    # class of the object
summary(s4r)  # per-column summary
str(s4r)      # compact structure: column types and first values

# Criteria
#str_split(head(s4r$ind), "0")

# load the library for str_extract ("Extract first piece of a string that matches a pattern.")
library(stringr)


# Check the Indicators and Dimensions columns
s4r$IND
s4r$DIM

# obtain the dimension letter
# s4r.dim.l <- str_extract(s4r$ind, "^.") # get the first letter at the beginning of the string
# remove NA?

# Check case: list the distinct codes as they are stored in the data.
# factor() is applied first because levels() returns NULL on a plain
# character column; on a column that is already a factor it changes nothing
# except dropping unused levels.
levels(factor(s4r$IND))
levels(factor(s4r$DIM))

# Check case with an upper modification: the distinct codes once upper-cased
levels(factor(toupper(s4r$IND)))
levels(factor(toupper(s4r$DIM)))

# Check the characteristic of the vector (toupper() gives a character vector)
class(toupper(s4r$IND))

# Make proper case directly in the data: upper-case the dimension codes and
# store them as a factor, so that e.g. "h" and "H" become a single level
s4r$DIM <- factor(toupper(s4r$DIM))

# Check it
levels(s4r$DIM)

# Create a vector with the first letter of the indicators to get the dimension
#s4r.dim.l <- str_extract(s4r.dim.l, "^.")

# join the dimension data as a new columns
#s4r <- cbind(s4r, s4r.dim.l)
#str(s4r)
#colnames(s4r) <- c("gccountry", "gcterritory", "crit", "gc", "ghlabel", 
#                  "q", "nbexpress", "ind", "cat", "proposition", "?", "dim") # Add "dim" (from dimension as column name)
#head(s4r) questions str(s4r$q)

# We see that there are some errors coded here (h vs. H; plus N, W):
#levels(s4r$dim)
# This doesn't work because the number of levels in the current state and the new ones (less number)
#levels(s4r$dim) <- levels(factor(toupper(s4r$dim)))
# So that it needs to be done this other way:
#s4r$dim <- as.factor(as.character(toupper(s4r$dim)))

# Check the dimensions for the chart
levels(s4r$DIM)

# To know which kinds of objects plot() has methods for
methods(plot)

# Make a pie chart with the dimensions
pie.dim <- table(s4r$DIM)
pie(pie.dim)

# Make a pie chart with the meaning code
pie.cat <- table(s4r$CAT)
pie(pie.cat)

# Same with only the questions ACT, IB and WB
# (%in% never yields NA, so records with a missing question are left out
# instead of showing up as NA entries)
pie.quest <- s4r$question[s4r$question %in% c("ACT", "IB", "WB")]
pie.dim <- table(pie.quest)
pie(pie.dim)

levels(s4r$question)

# Make a pie chart with the dimensions for a coordination group

# Check an operation with the indexes: which() gives the row numbers of the
# records whose CG column equals "Charleroi"
s4r.cg.idx <- which(s4r$CG == "Charleroi")
length(s4r.cg.idx)
check <- s4r[s4r.cg.idx, ]
head(check)
# Number of selected rows; it must equal length(s4r.cg.idx).
# (length() on a data frame would return the number of columns instead)
nrow(check)


# Dimensions of the records that are in Charleroi OR have the category "S--"
# NOTE(review): the heading above suggests that only the coordination group
# was meant to be selected; confirm whether `| s4r$CAT == "S--"` is intended.
x <- s4r$DIM[s4r$CG == "Charleroi" | s4r$CAT == "S--"]
length(x)
pie.dim <- table(x)
pie(pie.dim)


# FIGURE: Number of criteria in questions
# -----------------------------------------
# Pie chart whose slice labels also carry the count of each slice
# (adapted from http://www.statmethods.net/graphs/pie.html)
# Tabulate the question column, then drop the 5th and 6th entries of the table
my.table <- table(s4r$q)
my.table <- my.table[-(5:6)]
#table(s4r$q)[s4r$q=="Q--"]
my.table
# One label per slice: the question code, a line break, then its count
lbls <- paste0(names(my.table), "\n", my.table)
pie(
  my.table,
  labels = lbls,
  main = "Pie Chart of Questions\n (without error codes)"
)


# Example using googleVis package (Google Visualization API)
# ------------------------------------------------------------
# From https://code.google.com/p/google-motion-charts-with-r/wiki/GadgetExamples#Pie_Chart
library(googleVis)
## Pie chart
# Build the data frame for the chart from the counts held in my.table:
# one row per question, with its label and its count.
df.my.table <- data.frame(
  Questions = names(my.table),
  Counts = as.vector(my.table),
  stringsAsFactors = FALSE
)
# An empty question code would give an unlabelled slice: show it as "NA"
df.my.table$Questions[df.my.table$Questions == ""] <- "NA"
df.my.table
#length(df.my.table)
#class(df.my.table)
#class(CityPopularity)
# Name the label and value columns explicitly instead of relying on their order
Pie <- gvisPieChart(df.my.table, labelvar = "Questions", numvar = "Counts",
                    options = list(width = 400, height = 200))
plot(Pie)
# It doesn't seem to work locally in my computer [Xavier]
# Write the chart out as a Google Gadget XML file
cat(createGoogleGadget(Pie), file = "piechart.xml")
######################################################


# Number of criteria in dimensions
####################################################
# Pie Chart
# as of in http://www.statmethods.net/graphs/pie.html
# Count the records per dimension (DIM is the column name used by the rest of
# this script; `$` is case-sensitive, so s4r$dim would not find it)
my.pie.dim <- table(s4r$DIM)
#table(s4r$q)[s4r$q=="Q--"]
my.table <- my.pie.dim
# One label per slice: the dimension code, a line break, then its count
lbls <- paste0(names(my.table), "\n", my.table)
pie(my.table, labels = lbls, radius = 1.0,
    main = "Number of Criteria in Dimensions\n (with error codes: N, W, NA)")
#?pie

# Equivalent pie chart with GoogleVis API
# ----------------------------------------
library(googleVis)
# Turn the table of counts into a data frame, the input type of gvisPieChart()
df.my.table <- as.data.frame(my.table)
Pie <- gvisPieChart(
  data = df.my.table,
  options = list(width = 400, height = 200)
)
plot(Pie)
# It doesn't seem to work locally in my computer [Xavier]
# Write the chart out as a Google Gadget XML file
cat(createGoogleGadget(Pie), file = "piechart.xml")



# Number of criteria in Dimensions and groups
##############################################
# Stacked Bar Plot with Colors and Legend
# From http://www.statmethods.net/graphs/bar.html
# Cross-tabulate the first 10 records: DIM values as rows, s4r$cri values as
# columns (DIM is the column name used by the rest of this script).
# NOTE(review): s4r$cri presumably relies on partial matching of the column
# name by `$`; confirm the full column name and spell it out.
counts <- table(s4r$DIM[1:10], s4r$cri[1:10])
str(counts)
# One bar per column of `counts`, stacked and coloured by row;
# the legend lists the row names
barplot(counts, main = "Number of criteria in Dimensions and groups",
        xlab = "Dim",
        col = c("darkblue", "red", "green", "yellow", "brown", "orange",
                "pink", "cyan", "grey", "black", "coral"),
        legend.text = rownames(counts))


# Using ggplot2
# ---------------
# Example 1. from http://stackoverflow.com/questions/2619069/r-programming-creating-a-stacked-bar-graph-with-variable-colors-for-each-stack
library(ggplot2)
#data(mpg)     # data set provided w/ ggplot2
# Cross-tabulate the first 10 records; naming the two table dimensions gives
# the data frame below the columns "Dim", "Criterion" and "Freq".
# NOTE(review): s4r$cri presumably relies on partial matching of the column
# name by `$`; confirm the full column name and spell it out.
counts <- table(Dim = s4r$DIM[1:10], Criterion = s4r$cri[1:10])
# ggplot() needs a data frame, not a table: as.data.frame() gives one row per
# (Dim, Criterion) combination with its count in "Freq".
# stat = "identity" makes the bar heights those counts, stacked by Dim.
px <- ggplot(as.data.frame(counts),
             aes(x = Criterion, y = Freq, fill = Dim)) +
  geom_bar(stat = "identity")
print(px)


# Using ggplot2 & Reshape
# -----------------------
# "Reshape" is an R package that rearranges data much like the fancy "Pivot Tables" of spreadsheet programs.
# The two functions to master are "cast" and "melt". Search for them to find other simple usage examples.
#
# Example 2. From http://stackoverflow.com/questions/7583432/plot-stacked-bar-plot-in-r
##### THESE COMMANDS BELOW NEEDS TO BE UPDATED STILL TO USE SPIRAL'S DATA ###
library(ggplot2)
library(reshape2)

# Toy data: two measures (Value1, Value2) for samples A and B over periods 1 to 4
x <- data.frame(
  Period = rep(c(1, 2, 3, 4), each = 2),
  Sample = rep(c("A", "B"), times = 4),
  Value1 = c(3, 2, 6, 7, 3, 2, 1, 2),
  Value2 = c(1, 0, 5, 2, 2, 0, 2, 5)
)

# melt() keeps Period and Sample as identifiers and stacks the remaining
# columns into long format (columns "variable" and "value")
mx <- melt(x, id.vars = c("Period", "Sample"))
# Stacked bars per period, filled by measure, one panel per sample
ggplot(mx, aes(x = Period, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  facet_grid(~Sample)

# Using chart.StackedBar (scaled to 100%) from PerformanceAnalytics package
# -------------------------------------------------------------------------
# Taken from: http://braverock.com/brian/R/PerformanceAnalytics/html/chart.StackedBar.html
# * This function is a wrapper for barplot but adds three additional capabilities. First, it calculates and sets a bottom margin for long column names that are rotated vertically. That doesn't always result in the prettiest chart, but it does ensure readable labels.
# * Second, it places a legend "under" the graph rather than within the bounds of the chart (which would obscure the data). The legend is created from the column names. The default is to create the legend when there's more than one row of data being presented. If there is one row of data, the chart may be "unstacked" and the legend removed.
# * Third, it plots or stacks negative values from an origin of zero, similar to the behavior of barchart from the 'lattice' package.

# Install it if needed
# (requireNamespace() only tests whether the package can be loaded;
# the package is attached by library() just below)
if (!requireNamespace("PerformanceAnalytics", quietly = TRUE)) {
  install.packages("PerformanceAnalytics", repos = "http://ftp.heanet.ie/mirrors/cran.r-project.org/")
}

##### THESE COMMANDS BELOW NEEDS TO BE UPDATED STILL TO USE SPIRAL'S DATA ###
library(PerformanceAnalytics)
# Load the example data set "weights" and look at its first rows
data(weights)
head(weights)

# With the legend "under" the chart
chart.StackedBar(weights, date.format = "%Y", cex.legend = 0.7, colorset = rainbow12equal)

# Without the legend
chart.StackedBar(weights, colorset = rainbow12equal, legend.loc = NULL)

# for one row of data, use 'unstacked' for a better chart
# (drop = FALSE keeps the single row as a table instead of a plain vector)
chart.StackedBar(weights[1, , drop = FALSE], unstacked = TRUE, las = 3)


# Examples of Stacked bar charts (not scaled to 100%) using ggplot2
# ------------------------------------------------------------------
# as shown here: http://rpubs.com/nikedenise/3256
library(ggplot2)
# TODO: the examples from the page above still have to be adapted to this data.
# The original line here was an unfinished `s4r$` column reference, which made
# the whole file unparsable; show the first rows of the data instead.
head(s4r)




# -----------------------------------------------

#######################################################

# Records split by question code.
# %in% is used instead of == because it never yields NA: with ==, every
# record whose question is missing would come back as a row full of NA.
s4r.wb <- s4r[s4r$q %in% "WB", ]
s4r.ib <- s4r[s4r$q %in% "IB", ]
s4r.act <- s4r[s4r$q %in% "ACT", ]
s4r.fg <- s4r[s4r$q %in% "FG", ]
# Error coding answers, maybe?
s4r.q <- s4r[s4r$q %in% "Q--", ]
s4r.t <- s4r[s4r$q %in% "Timisoara", ]

#################################################
# Text mining
#################################################
# Includes a word cloud  with the description of the packages from http://mods.tiki.org
# Example adapted from the ones shown here: http://www.r-bloggers.com/word-cloud-in-r/
# If required the first time, uncomment the following 4 lines to install the required R packages, and edit the path and url to suit your needs.
#install.packages("XML", repos="http://ftp.heanet.ie/mirrors/cran.r-project.org/")
#install.packages("tm", repos="http://ftp.heanet.ie/mirrors/cran.r-project.org/")
#install.packages("wordcloud", repos="http://ftp.heanet.ie/mirrors/cran.r-project.org/")
#install.packages("RColorBrewer", repos="http://ftp.heanet.ie/mirrors/cran.r-project.org/")

# Install each package that is missing, then attach it.
# library() stops with an error if the package still cannot be loaded, so a
# failed installation is noticed here rather than further down the script.
# local() keeps the helper variables out of the workspace; the packages are
# attached to the search path as usual.
local({
  cran.repo <- "http://ftp.heanet.ie/mirrors/cran.r-project.org/"
  for (pkg in c("XML", "tm", "googleVis", "wordcloud", "RColorBrewer")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = cran.repo)
    }
    library(pkg, character.only = TRUE)
  }
})

#u2 = "http://mods.tiki.org/"
#t2 = readHTMLTable(u2)

# Word cloud of the text found in s4r.wb$cri
# NOTE(review): s4r.wb$cri presumably relies on partial matching of the column
# name by `$`; confirm the full column name and spell it out.
t.wb <- s4r.wb$cri
#head(t.wb)
# One document per element of the text vector. VectorSource() takes a plain
# character vector (DataframeSource() expects a data frame with "doc_id" and
# "text" columns in current versions of tm).
ap.corpus <- Corpus(VectorSource(as.character(t.wb)))
#head(ap.corpus)
#str(ap.corpus)
#class(t.wb)
#as.character(t.wb)
ap.corpus <- tm_map(ap.corpus, removePunctuation)
# Functions that are not tm transformations, such as tolower(), must be
# wrapped in content_transformer() so that the documents keep their class
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
# NOTE(review): English stop words are removed; confirm that this matches the
# language of the text.
ap.corpus <- tm_map(ap.corpus, removeWords, stopwords("english"))
# Term frequencies: sum each term over all documents, most frequent first
ap.tdm <- TermDocumentMatrix(ap.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m), decreasing = TRUE)
ap.d <- data.frame(word = names(ap.v), freq = ap.v)
#table(ap.d$freq)
pal <- brewer.pal(8, "Dark2")
#png("wordcloud_packages.png", width=1280,height=800)
# Only terms that occur at least 3 times are drawn (min.freq = 3)
wordcloud(ap.d$word, ap.d$freq, scale = c(8, .2), min.freq = 3,
          max.words = Inf, random.order = FALSE, rot.per = .15, colors = pal)
#dev.off()

# Second word cloud, built from the 6th column of the first table in `t2`.
# `t2` only exists if the readHTMLTable() lines (commented out earlier in this
# section) have been run; without this guard the script stops here with
# "object 't2' not found".
if (exists("t2")) {
  print(class(t2[[1]][6]))
  # One document per cell of that column (see ?VectorSource)
  ap.corpus2 <- Corpus(VectorSource(as.character(t2[[1]][[6]])))
  print(head(ap.corpus2))
  str(ap.corpus2)

  ap.corpus2 <- tm_map(ap.corpus2, removePunctuation)
  # tolower() is not a tm transformation: wrap it in content_transformer()
  ap.corpus2 <- tm_map(ap.corpus2, content_transformer(tolower))
  ap.corpus2 <- tm_map(ap.corpus2, removeWords, stopwords("english"))
  # Term frequencies: sum each term over all documents, most frequent first
  ap.tdm2 <- TermDocumentMatrix(ap.corpus2)
  ap.m2 <- as.matrix(ap.tdm2)
  ap.v2 <- sort(rowSums(ap.m2), decreasing = TRUE)
  ap.d2 <- data.frame(word = names(ap.v2), freq = ap.v2)
  #table(ap.d2$freq)
  pal2 <- brewer.pal(8, "Dark2")
  #png("wordcloud_packages.png", width=1280,height=800)
  wordcloud(ap.d2$word, ap.d2$freq, scale = c(8, .2), min.freq = 3,
            max.words = Inf, random.order = FALSE, rot.per = .15, colors = pal2)
  #dev.off()
} else {
  message("Skipping the second word cloud: object 't2' is not defined")
}

############## 
# Testing Shiny
# ----------------
# Install shiny only when it is missing (instead of re-installing it on every
# run), from the same CRAN mirror as the other packages in this file
if (!requireNamespace("shiny", quietly = TRUE)) {
  install.packages("shiny", repos = "http://ftp.heanet.ie/mirrors/cran.r-project.org/")
}
library(shiny)
#runExample("01_hello")

# Define UI for application that plots random distributions 
shinyUI(pageWithSidebar(
  
  # Application title
  headerPanel("Hello Shiny!"),
  
  # Sidebar with a slider input for number of observations
  # (input id "obs", range 0 to 1000, initial value 500)
  sidebarPanel(
    sliderInput("obs", 
                "Number of observations:", 
                min = 0, 
                max = 1000, 
                value = 500)
  ),
  
  # Show a plot of the generated distribution
  # (placeholder for the output with id "distPlot")
  mainPanel(
    plotOutput("distPlot")
  )
))



############ R meeting for wikispiral archive ########################

######################################################################
# rcoe_00.r = spiral.r
# * Testing some R commands for Wikispiral - http://wikispiral.org
# * Notes by Xavier de Pedro Puente - xavier.depedro@vhir.org
#     VHIR-UEB - http://ueb.vhir.org
#
# Available online at:https://dl.dropbox.com/u/1293126/spiral.r
# 
# Under the license: cc-by-sa 3.0
# http://creativecommons.org/licenses/by-sa/3.0/
#######################################################################

# Notes from Monday March 25th:
# --------------------------------
# Morning: 
# * Intro to Spiral, Tiki, PluginR, Trackers, R, GUI's for R, R packaged, etc.
#
# Afternoon (after lunch):
# -------------------------
# Samuel Thirion:
# GOAL: Feed data (& statistics) for politicians (policy makers) in CoE.
#
# Issues to be solved:
# * homogeneous groups need to be identified (common patterns: Men/Women/Unknown; disabled, ...)
#   ## Main demand from policies: data for specific groups
#   ## Data by team: policies of education, policies of urbanism, etc. We have to be able to know what people say. 
#   ## Mine the data that will affect the unemployment policies.
#   ## Differences between towns. How Grenoble is different from the rest of France, or from other countries, etc.
#   ## We need analysis from the point of view of the dimensions of well-being: you can be limited because of possibility criteria: 
#   ## young people cannot find jobs because of lacking training, not finding the best , ...
# Statistics by words, done by collaborators in Wallonie (Belgique): Child (children), ...
#
# Types of HOMOGENEOUS GROUPS (HG):
## Sex: men, women, unknown/unreported
## Age: children, teenagers, adults, seniors
## Others ("Type of Population Group"): homeless, prisoners, farmers, migrants, doctors, wealthy, autists, Civil Servants,...., undefined, others.
#
# Two types of information and results, depending on the target population of the results:
## For local people
### Espoir website.
## For policy makers
### (Nothing, as of today). "The most urgent need for Spiral as of today is to prepare results for politicians/policy makers"
#
# Other needs for these two days:
## (1) Repartition of criteria (answers by individuals) by dimensions (9): What did they speak more about? 
### each type of population group
### for each question
## (2) Qualitative information using the codes
### Signification code: Possibility, Quality (the 2 most important to Samuel Thirion)
### Object code: for some database only (like in Cape Verd, or Gabon, because they speak very concretely about object codes, very specific things)
## (3) "Research on the statistics of Words" (Samuel Thirion). [[XXX - Sylvie Greverend, that used to work in text mining, lemming, etc. - Ask Jyhem]]
## Geographic Statistics

# And all these analysis, split them: 
## (A) by "Homogenous groups", or [descriptive of current data]
## (B) by Administrative level (GeoLocated Data) [prospective statistics]

# Maite:
# Non Probability > Focus group > 
# our basic data (population) is not the individuals, but the criteria they said. So the results will not refer to the individuals (but to the criteria).
# Relationships between VARIABLES 
## __Multivariate analysis__:
### CA: Correspondence Analysis
### CCA: Canonical Correspondence Analysis
## Or __Text mining__, remembering that our population is not the individuals but the criteria written by the individuals sampled.
#
# The data is always nominal, categorical... (you can't do "histograms", but only bar charts; 
#   the shape can be the same, but the theoretical framework is not valid for this type of categorical data; 
# histograms are for continuous numerical variables)
#
# Chi-Square test to see relationship between variables.
## chisq.test
# Cluster Analysis:
# CA: Correspondence Analysis.
## ca , FactoMineR
# CCA: Canonical Correspondence (Correlation) Analysis (it's better when you have lots of data variables)
## Vegan, yADE4 [ade4TkGUI]
# Text Mining
## tm
# Spatial analysis 
## spatial, maptools, xxxx

# DIMENSIONS in Spiral Project, with the basic indicator groups (A, B, C, D, E, F):
# (as read from tracker2: https://wikispiral.org/tracker2 )
# A   Access to essential ressources in general
# B   Living environment in general
# C   Relations with and between organisations in general
# D   Personal relations in general
# E   Social balance in general
# F   Personal balance in general
# G   Well-being or ill-being feelings in general
# H   Attitudes and initiatives in general
# I   Relationships within society

# There are 4 questions
# 1st: WB - What does well-being mean for you?
# 2nd: IB - What does ill-being mean for you?
# 3rd: ACT - What do you do or would like to do for your well-being and the well-being of all?
# 4th: FG - Seeing your actual statements, what do you think is needed to grant this well-being for the future generations?

# 4 Categories (of Answers). Referred as "sig_re_1" by the charts from Maite (with data from Cape Verd)
# -------------------------
# O = Obtention
# P = Possibility
# Q = Quality
# S = Sustainability
#
# Charts they would like to use to show results from Spiral (for the Analysis toolbox to be shown in the wikispiral website):
# ----------------------------------------------
# * stacked bar/column charts.
# * CA for local groups, one per question.
# * Dimensions and questions: which are the dimensions most linked to well-being, to ill-being
# * Comparisons among regional data: 
#     a) Thematic maps (just display on top of maps). 
#     b) Spatial Statistics Analysis (you have new variables, related to the coordinates in the maps; 
#       you can study by polygons (countries), by coordinates, ). 3 types:
#         b.1.) by points
#         b.2.) by shapes (polygons)
#         b.3.) by raster
# * (Word-cloud, maybe)
#
# Matthias Ansorg
# -----------------
# Two ways to structure databases (depending on the future use)
# * OLTP: Online Transaction Processing (just a few queries in websites)
# * OLAP: Online analysis Processing (many joins on hundreds of tables in the db, etc)
# * Data Warehouses
# He does what he calls "Data Cubes" [like rubik cube of data]: multidimensional matrix, created by n dimensions.
# * Dimension: Analytical view of the data
# * Fact (cells in the cube matrix): criteria
#
#
# R views to have into account: [Xavi]
# -----------------------------
# * http://cran.r-project.org/web/views/SocialSciences.html
# * http://cran.r-project.org/web/views/Spatial.html
# * http://cran.r-project.org/web/views/NaturalLanguageProcessing.html
# * http://cran.r-project.org/web/views/Graphics.html
#
# R packages to explore (me):
# --------------------------
# spatial (maite)
# osmar (OSM Maps in R - find it out myself Xavi)
# ade4TkGUI
# FactoMineR

###################################