Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Movielens exercise gomez_enrique #92

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ docs/site/
# committed for packages, but should be committed for applications that require a static
# environment.
Manifest.toml

# MAC OS
.DS_Store
Empty file added bash/README.md
Empty file.
56 changes: 56 additions & 0 deletions bash/src/student_submissions/movielens/gomez_enrique/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# How to run

```bash
sh run.sh filepath_ratings filepath_movies
```

Tested with `awk` and `split` from https://www.gnu.org/software/coreutils/, and `SQLite` version 3.43.2.

__Important note__: awk and split are aliased as `gawk` and `gsplit` on my machine. Change accordingly inside `run.sh`.


# Result

```
File split (gnu split):

real 0m0.398s
user 0m0.002s
sys 0m0.178s

Inner join (sqlite):

real 0m6.993s
user 0m51.856s (sum of the time of each individual process spawned)
sys 0m1.247s

Average rating (gnu awk):

real 0m0.155s
user 0m0.036s
sys 0m0.005s
```

| GENRE | RATING |
| ------------------ | -------- |
| (no genres listed) | 3.326379 |
| Action | 3.466592 |
| Adventure | 3.517445 |
| Animation | 3.614946 |
| Children | 3.432507 |
| Comedy | 3.423993 |
| Crime | 3.685044 |
| Documentary | 3.705281 |
| Drama | 3.677185 |
| Fantasy | 3.511589 |
| Film-Noir | 3.925728 |
| Horror | 3.293563 |
| IMAX | 3.603712 |
| Musical | 3.554716 |
| Mystery | 3.670169 |
| Romance | 3.542712 |
| Sci-Fi | 3.478143 |
| Thriller | 3.522964 |
| War | 3.791466 |
| Western | 3.585755 |

18 changes: 18 additions & 0 deletions bash/src/student_submissions/movielens/gomez_enrique/average.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Computes the average rating per genre.
#
# Each input row has three comma-separated fields:
#   $1  genres, a "|"-separated list (e.g. "Action|Comedy")
#   $2  sum of ratings for that genre combination
#   $3  number of ratings for that genre combination
#
# Output: one "genre,average" line per individual genre, in unspecified order.
BEGIN {
    FS = ","
}

# Runs for every input row (no row is skipped): credit the row's rating sum
# and rating count to each individual genre listed in the first field.
{
    total = split($1, names, "|")
    for (i = 1; i <= total; i++) {
        genre = names[i]
        sums[genre] += $2
        counts[genre] += $3
    }
}

# Emit the average (accumulated sum / accumulated count) of every genre seen.
END {
    for (genre in sums) {
        printf("%s,%f\n", genre, sums[genre] / counts[genre])
    }
}
46 changes: 46 additions & 0 deletions bash/src/student_submissions/movielens/gomez_enrique/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Usage: sh run.sh filepath_ratings filepath_movies
#
# Pipeline:
#   1. split the ratings CSV into ~50M chunks (tmp_split_NN.csv),
#   2. for every chunk, start a background sqlite3 process that inner-joins the
#      chunk with the movies CSV and writes "genres,sum(rating),count(rating)"
#      rows to tmp_innerjoin_<chunk>.csv,
#   3. reduce all partial results to one average per genre with average.awk.
#
# NOTE: `time wait` relies on `time` being a shell keyword (as in bash); an
# external `time` binary cannot run the `wait` builtin.
if [ "$#" -lt 2 ]; then
    echo "usage: sh run.sh filepath_ratings filepath_movies" >&2
    exit 1
fi

filepath_ratings=$1
filepath_movies=$2

echo "File split:"
# -C keeps lines intact; the path is quoted so that names with spaces work.
time gsplit -d -C 50M --additional-suffix .csv "$filepath_ratings" "tmp_split_"
# Only the first chunk carries the CSV header; drop it so that every chunk
# contains data rows only.
tail -n +2 tmp_split_00.csv > tmp_split_00.tmp && mv tmp_split_00.tmp tmp_split_00.csv

for filepath_split in tmp_split_*.csv; do
    # Chunk name without directory and extension, used to name the output file.
    filename=$(basename -- "$filepath_split")
    filename="${filename%.*}"
    # One sqlite3 process per chunk, all running in the background.
    # The heredoc lines stay at column 0 and the movies path is single-quoted
    # so that sqlite3 reads a path containing spaces as one argument.
    sqlite3 ":memory:" <<SQL &
.output /dev/null
pragma journal_mode = OFF;
pragma synchronous = OFF;
pragma journal_size_limit = 0;

.import '$filepath_movies' movies --csv
create unique index idx_movieid on movies(movieId);

create table ratings(userId, movieId, rating, timestamp);
.import $filepath_split ratings --csv

.mode csv
.header off
.out tmp_innerjoin_$filename.csv

select genres, sum(rating), count(rating)
from movies inner join ratings
on movies.movieId = ratings.movieId
group by genres;

.exit
SQL
done

# printf instead of echo: echo "\n" only prints a newline on shells whose
# echo interprets backslash escapes.
printf '\nInner join:\n'
time wait

printf '\nAverage rating:\n'
cat tmp_innerjoin_*.csv > tmp_ratings_min.csv
time gawk -f average.awk tmp_ratings_min.csv | sort > tmp_out.csv

cat tmp_out.csv

# Remove every intermediate file created above.
rm tmp_*.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# How to run

```console
$ go build
$ ./movielens file_to_split total_splits out_folder file_to_merge
```

where:
- `file_to_split`: ratings.csv file
- `total_splits`: total number of files to split the ratings into
- `out_folder`: directory where the split files will be stored
- `file_to_merge`: movies.csv file

# Example

```bash
./movielens "ratings.csv" 10 "." "movies.csv"
```

## Results

__Split: 229ms__

__Merge & Count: 10363ms__

| GENRE | AVERAGE_RATING |
|-------------|----------------------------------|
| (no genres listed) | 3.326379239118188 |
| Action | 3.466591472228235 |
| Adventure | 3.517444379462875 |
| Animation | 3.614946348438093 |
| Children | 3.4325074920278045 |
| Comedy | 3.423992522260525 |
| Crime | 3.6850431095379377 |
| Documentary | 3.7052805249822454 |
| Drama | 3.6771844525139366 |
| Fantasy | 3.5115889157486 |
| Film-Noir | 3.9257258540768367 |
| Horror | 3.2935633075659174 |
| IMAX | 3.6037121959523324 |
| Musical | 3.5547170809260242 |
| Mystery | 3.670169244577933 |
| Romance | 3.542711629764427 |
| Sci-Fi | 3.4781437345156516 |
| Thriller | 3.522964338694794 |
| War | 3.7914657875591984 |
| Western | 3.585752382527443 |

Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module movielens

go 1.23.1

require github.com/kfultz07/go-dataframe v0.0.0-20240718202924-1901644284c7

require (
github.com/aws/aws-sdk-go v1.44.57 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect
gopkg.in/yaml.v2 v2.3.0 // indirect
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
github.com/aws/aws-sdk-go v1.44.57 h1:Dx1QD+cA89LE0fVQWSov22tpnTa0znq2Feyaa/myVjg=
github.com/aws/aws-sdk-go v1.44.57/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/kfultz07/go-dataframe v0.0.0-20240718202924-1901644284c7 h1:ss8yd7vvaO+HMbZ5x9dnRyUWeX+Mlto1dpTJc/xKo/E=
github.com/kfultz07/go-dataframe v0.0.0-20240718202924-1901644284c7/go.mod h1:78T0LPP6YPXqorHDU0teK01g3gjNAurB5TJW3o/AF5c=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY=
golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8=
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package main

import (
"fmt"
"movielens/split"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"time"

dataframe "github.com/kfultz07/go-dataframe"
)

// Stats accumulates the ratings seen for a single genre.
type Stats struct {
// Rating is the running sum of all ratings added for the genre.
Rating float64
// Observations is the number of ratings added; the average printed by main
// is Rating divided by Observations.
Observations int
}

// lock guards the stats map that the goroutines started in main update
// concurrently through processFile.
// NOTE(review): only Lock/Unlock are called in this file, so a plain
// sync.Mutex would likely be sufficient — confirm no other file uses RLock.
var lock = &sync.RWMutex{}

// main splits the ratings file into chunks, merges every chunk with the
// movies file in parallel and prints the average rating per genre, sorted by
// genre name.
//
// Command-line arguments:
//
//	1: file to split (ratings CSV)
//	2: total number of chunks
//	3: output directory for the chunks
//	4: file to inner join with (movies CSV)
func main() {
	// Fail with a usage message instead of an index-out-of-range panic when
	// arguments are missing.
	if len(os.Args) < 5 {
		fmt.Fprintln(os.Stderr, "usage: movielens file_to_split total_splits out_folder file_to_merge")
		os.Exit(1)
	}

	// Arg 1: file to split
	filename := os.Args[1]

	// Arg 2: total chunks
	totalChunks, err := strconv.Atoi(os.Args[2])
	if err != nil {
		panic(err)
	}

	// Arg 3: out-path
	outPath := os.Args[3]

	// Arg 4: file to inner join with
	filenameJoin := os.Args[4]

	// Split file
	start := time.Now()
	split.Split(filename, totalChunks, outPath)
	elapsed := time.Since(start)
	fmt.Printf("Split: %dms\n", elapsed.Milliseconds())

	// TODO: use generic filepaths
	dfMovies := dataframe.CreateDataFrame("", filenameJoin)

	// Create a map to keep track of stats
	stats := make(map[string]*Stats)

	// Collect the chunk files to process. Glob only fails on a malformed
	// pattern, which can happen when outPath contains glob metacharacters.
	matches, err := filepath.Glob(filepath.Join(outPath, "tmp_ratings*.csv"))
	if err != nil {
		panic(err)
	}

	// Parallel processing: one goroutine per chunk file.
	start = time.Now()
	var wg sync.WaitGroup
	for _, p := range matches {
		wg.Add(1)
		// The path is passed as an argument so that the goroutine does not
		// depend on the loop-variable semantics of the Go version in use.
		go func(path string) {
			defer wg.Done()
			processFile(path, dfMovies, stats)
		}(p)
	}
	wg.Wait()
	elapsed = time.Since(start)
	fmt.Printf("Merge & Count: %dms\n\n", elapsed.Milliseconds())

	// Sequential alternative, kept for reference:
	//for _, p := range matches {
	//	processFile(p, dfMovies, stats)
	//}

	// Sort stats: map iteration order is random, so sort the genre names to
	// get a deterministic output order.
	keys := make([]string, 0, len(stats))
	for k := range stats {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	// Print stats: genre followed by its average rating.
	for _, k := range keys {
		fmt.Println(k, stats[k].Rating/float64(stats[k].Observations))
	}
}

// processFile loads one ratings chunk, attaches the genres of every rated
// movie from dfMovies and adds the per-genre rating sums and counts to stats.
//
// stats is shared between goroutines. The rows are first aggregated into a
// private map and merged into stats once, while the package-level lock is
// held, so that concurrent workers do not contend for the lock on every row.
//
// The function panics when the merge fails or a rating is not a valid float.
func processFile(filename string, dfMovies dataframe.DataFrame, stats map[string]*Stats) {
	// filename already carries its directory, so no path prefix is passed
	// (same convention as the movies file in main).
	dfRatings := dataframe.CreateDataFrame("", filename)
	// TODO: check that no data row is lost when the headers are replaced
	// (presumably CreateDataFrame consumes the first line as header — confirm).
	// The chunk has four columns, so their indexes are 0..3.
	dfRatings.Headers = map[string]int{"userId": 0, "movieId": 1, "rating": 2, "timestamp": 3}
	if err := dfRatings.Merge(&dfMovies, "movieId", "genres"); err != nil {
		panic(err)
	}

	// Aggregate this chunk without taking the lock.
	local := make(map[string]*Stats)
	for _, row := range dfRatings.KeepColumns([]string{"genres", "rating"}).FrameRecords {
		// Get rating data (second kept column)
		rating, err := strconv.ParseFloat(row.Data[1], 64)
		if err != nil {
			panic(err)
		}

		// Split into individual genres and count the rating once per genre
		for _, key := range strings.Split(row.Data[0], "|") {
			s, ok := local[key]
			if !ok {
				s = new(Stats)
				local[key] = s
			}
			s.Rating += rating
			s.Observations++
		}
	}

	// Merge the chunk totals into the shared map.
	lock.Lock()
	defer lock.Unlock()
	for key, part := range local {
		total, ok := stats[key]
		if !ok {
			total = new(Stats)
			stats[key] = total
		}
		total.Rating += part.Rating
		total.Observations += part.Observations
	}
}
Loading