April 2019

Massive matrices

  • Loading a file larger than the available RAM

du -h /tmp/bigmem.tsv
## 1,1G    /tmp/bigmem.tsv
free -h
##               total        used        free
## Mem:           480M        287M         26M
## Swap:          472M         64M        408M

big_table <- read.table(file = "/tmp/bigmem.tsv", sep = "\t",
                        header = TRUE)
## Error: cannot allocate vector of size 1.1 G

  • Solution:

    Do not load the data into RAM

    Load into RAM only a pointer and the file's metadata

https://cran.r-project.org/web/views/

 |- High-Performance and Parallel Computing with R

   |- Large memory and out-of-memory data
   
       |- ff package: offers file-based access to data sets that are
          too large to be loaded into memory, along with a number of
          higher-level functions.
          (see its "Reverse ..." listings on CRAN)

library("ff")
# big_table <- read.table(file = "/tmp/bigmem.tsv", sep = "\t",
#                         header = TRUE)
big_table <- read.table.ffdf(file = "/tmp/bigmem.tsv", sep = "\t",
                             header = TRUE)

class(big_table)
# [1] "ffdf"
dim(big_table)
# [1] 10000000        6

colnames(big_table)
# [1] "V1" "V2" "V3" "V4" "V5" "V6"
class(big_table[,1])
# [1] "numeric"
class(big_table$V1)
# [1] "ff_vector" "ff"

system.time({ aux <- big_table[,1] })
#    user  system elapsed 
#   0.169   0.000   0.170
system.time({ aux <- big_table$V1 })
#    user  system elapsed 
#       0       0       0
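The difference comes from what each access returns: big_table[, 1] reads the whole column from disk into an ordinary numeric vector, while big_table$V1 only returns the small ff object that points at the file. A minimal check, assuming the same big_table as above (exact sizes vary by session):

object.size(big_table[, 1]) # an in-RAM numeric vector, about 8 bytes per value
object.size(big_table$V1)   # only the ff metadata/pointer; the data stays on disk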

  • How do we work with an ffdf object?

Suppose we want to compute the mean of each column ~> divide and conquer

chunks_lims <- c(seq(1, nrow(big_table), by=10000), nrow(big_table))
length(chunks_lims)
# [1] 1001

We split each column into 1,000 sub-vectors of 10,000 numbers each.
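As a smaller, hypothetical illustration of what the boundary vector looks like (a 100-row table split into chunks of 10):

# 11 boundaries delimit 10 chunks
c(seq(1, 100, by = 10), 100)
# the resulting boundaries: 1 11 21 31 41 51 61 71 81 91 100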


unlist(lapply(colnames(big_table), function(i) { # for each column
  # sum each chunk of the column separately
  sums <- unlist(lapply(seq_along(chunks_lims)[-1], 
                                function(chunk_i) {
    sum(big_table[ # add up the values of this chunk
      # rows from the previous boundary up to (but not including) the current one
      seq(chunks_lims[[chunk_i-1]], chunks_lims[[chunk_i]]-1),
      i])
  }))
  # add the chunk sums together and divide by the total number of rows
  sum(sums) / nrow(big_table)
}))
# [1] -1.340206e-04 -1.644488e-05  4.895380e-04  3.787050e-04 -4.588027e-05  3.908912e-04


# ffrowapply evaluates the expression over chunks of at most BATCHSIZE rows;
# i1 and i2 are filled in with the first and last row of the current chunk
colSums(do.call(
  rbind,
  ffrowapply(colSums(big_table[i1:i2, , drop = FALSE]),
             X = big_table,
             RETURN = TRUE,
             CFUN = "list",
             BATCHSIZE = 10000)
)) / nrow(big_table)
#            V1            V2            V3            V4            V5            V6 
# -1.340625e-04 -1.635816e-05  4.894799e-04  3.786488e-04 -4.603215e-05

Using multiple nodes

  1. Submitting multiple independent jobs: SLURM takes care of the distribution

  2. Using MPI from within R (Rmpi) ~> requires some extra work

Configuring Rmpi (on a SLURM system)

module load R
Rscript -e ".libPaths()[[1]]"
## [1] "/home/jcrodriguez/local/R_libs/3.5.2"

ls /home/jcrodriguez/local/R_libs/3.5.2/Rmpi/Rprofile
## /home/jcrodriguez/local/R_libs/3.5.2/Rmpi/Rprofile

# Be careful not to overwrite an existing ~/.Rprofile
cp /home/jcrodriguez/local/R_libs/3.5.2/Rmpi/Rprofile ~/.Rprofile

Some of its functions (out of ~125):

library("Rmpi")

mpi.universe.size() # number of CPUs available in a cluster
mpi.spawn.Rslaves(nslaves = ns) # spawns R slaves to the nodes
mpi.close.Rslaves() # shuts down R slaves spawned
mpi.quit() # terminates MPI execution environment and quits R

# execute cmd(...) on R slaves
mpi.bcast.cmd(cmd, ...)

# execute cmd(...) on R slaves and return all executed results
mpi.remote.exec(cmd, ...)
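Put together, a minimal session might look like this sketch (the number of slaves and the broadcast expression are illustrative, not a fixed recipe):

library("Rmpi")

# one slave per available CPU, keeping one process for the master
ns <- mpi.universe.size() - 1
mpi.spawn.Rslaves(nslaves = ns)

# ask every slave to identify itself; results come back as a list
mpi.remote.exec(paste("slave", mpi.comm.rank(), "on", Sys.info()[["nodename"]]))

mpi.close.Rslaves()
mpi.quit()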

Exercise

  • Run parallelizable code (e.g., independent simulations) on Mendieta using more than one core:

    • a parallel R script (mclapply or parLapply); see the sketch below
    • an sh script to submit it
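A minimal sketch of the R side, assuming hypothetical file names and a SLURM allocation that sets SLURM_CPUS_PER_TASK (the simulation body is just a placeholder):

# sim_parallel.R -- independent simulations spread over several cores
library("parallel")

# cores granted by SLURM; fall back to 1 when run interactively
n_cores <- as.integer(Sys.getenv("SLURM_CPUS_PER_TASK", unset = "1"))

n_sims <- 100
results <- mclapply(seq_len(n_sims), function(i) {
  mean(rnorm(1e6)) # placeholder for one independent simulation
}, mc.cores = n_cores)

saveRDS(results, "sim_results.rds")

The accompanying sh script would load the R module and run Rscript sim_parallel.R inside an sbatch job that requests the desired --cpus-per-task.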

library("thesimpsons")

colnames(characters)
## [1] "id"              "name"            "normalized_name" "sex"
colnames(episodes)
##  [1] "id"                     "title"                 
##  [3] "original_air_date"      "production_code"       
##  [5] "season"                 "number_in_season"      
##  [7] "number_in_series"       "us_viewers_in_millions"
##  [9] "views"                  "imdb_rating"           
## [11] "imdb_votes"             "image_url"             
## [13] "video_url"

colnames(locations)
# [1] "id"              "name"            "normalized_name"
colnames(script_lines)
#  [1] "id"                 "episode_id"         "number"            
#  [4] "raw_text"           "timestamp_in_ms"    "speaking_line"     
#  [7] "character_id"       "location_id"        "raw_character_text"
# [10] "raw_location_text"  "spoken_words"       "normalized_text"   
# [13] "word_count" 

Possible questions:

  • Who spoke the most words in each location? (see the sketch below)

  • Which words does each character use the most?

  • Which characters speak the most words in each season?
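For instance, the first question can be approached with script_lines alone; a rough sketch, assuming word_count may need coercion to numeric and using the raw text columns listed above:

library("thesimpsons")

# word_count may come in as character; coerce it (lines without a count become NA)
script_lines$word_count <- suppressWarnings(as.numeric(script_lines$word_count))

# total words spoken per (character, location) pair
words <- aggregate(word_count ~ raw_character_text + raw_location_text,
                   data = script_lines, FUN = sum)

# for each location, keep the character with the largest total
top_by_location <- do.call(rbind, lapply(
  split(words, words$raw_location_text),
  function(d) d[which.max(d$word_count), , drop = FALSE]))
head(top_by_location)

The per-location (or per-character, per-season) splits are independent, so this is exactly the kind of step that mclapply or parLapply can parallelize for the exercise.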