camachojua · egmzcrz · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024 · Nov 9, 2024
diff --git a/.gitignore b/.gitignore
@@ -22,3 +22,6 @@ docs/site/
 # committed for packages, but should be committed for applications that require a static
 # environment.
 Manifest.toml
+
+# MAC OS
+.DS_Store
diff --git a/bash/README.md b/bash/README.md
diff --git a/bash/src/student_submissions/movielens/gomez_enrique/README.md b/bash/src/student_submissions/movielens/gomez_enrique/README.md
@@ -0,0 +1,56 @@
+# How to run
+
+```bash
+sh run.sh filepath_ratings filepath_movies
+```
+
+Tested with GNU `awk` (gawk, https://www.gnu.org/software/gawk/), `split` from https://www.gnu.org/software/coreutils/, and `SQLite` version 3.43.2.
+
+__Important note__: awk and split are aliased as `gawk` and `gsplit` on my machine. Change accordingly inside `run.sh`.
+
+
+# Result
+
+```
+File split (gnu split):
+
+real    0m0.398s
+user    0m0.002s
+sys     0m0.178s
+
+Inner join (sqlite):
+
+real    0m6.993s
+user    0m51.856s (sum of the time of each individual process spawned)
+sys     0m1.247s
+
+Average rating (gnu awk):
+
+real    0m0.155s
+user    0m0.036s
+sys     0m0.005s
+```
+
+| GENRE              | RATING   |
+| ------------------ | -------- |
+| (no genres listed) | 3.326379 |
+| Action             | 3.466592 |
+| Adventure          | 3.517445 |
+| Animation          | 3.614946 |
+| Children           | 3.432507 |
+| Comedy             | 3.423993 |
+| Crime              | 3.685044 |
+| Documentary        | 3.705281 |
+| Drama              | 3.677185 |
+| Fantasy            | 3.511589 |
+| Film-Noir          | 3.925728 |
+| Horror             | 3.293563 |
+| IMAX               | 3.603712 |
+| Musical            | 3.554716 |
+| Mystery            | 3.670169 |
+| Romance            | 3.542712 |
+| Sci-Fi             | 3.478143 |
+| Thriller           | 3.522964 |
+| War                | 3.791466 |
+| Western            | 3.585755 |
+
diff --git a/bash/src/student_submissions/movielens/gomez_enrique/average.awk b/bash/src/student_submissions/movielens/gomez_enrique/average.awk
@@ -0,0 +1,18 @@
+# Computes the average rating per genre.
+#
+# Input: comma-separated records of the form
+#   genres,rating_sum,rating_count
+# where `genres` is a "|"-separated list of genre names. run.sh produces these
+# records with its `select genres, sum(rating), count(rating)` query.
+# Output: one "genre,average" line per genre, in no particular order.
+BEGIN {
+    FS = ","
+}
+
+# Every record credits its partial sum and its count to each genre it lists.
+{
+    split($1, names, "|")          # 1st field: genre list
+    for (i in names) {
+        genre = names[i]
+        total[genre] += $2         # 2nd field: sum of ratings
+        count[genre] += $3         # 3rd field: number of ratings in that sum
+    }
+}
+
+END {
+    for (genre in total) {
+        printf("%s,%f\n", genre, total[genre] / count[genre])
+    }
+}
diff --git a/bash/src/student_submissions/movielens/gomez_enrique/run.sh b/bash/src/student_submissions/movielens/gomez_enrique/run.sh
@@ -0,0 +1,46 @@
+filepath_ratings=$1
+filepath_movies=$2
+
+echo "File split:"
+time gsplit -d -C 50M $filepath_ratings --additional-suffix .csv "tmp_split_"
+tail -n +2 tmp_split_00.csv > tmp_split_00.tmp && mv tmp_split_00.tmp tmp_split_00.csv
+
+for filepath_split in tmp_split_*.csv; do
+    filename=$(basename -- "$filepath_split")
+    extension="${filename##*.}"
+    filename="${filename%.*}"
+    sqlite3 ":memory:" <<SQL &
+.output /dev/null
+pragma journal_mode = OFF;
+pragma synchronous = OFF;
+pragma journal_size_limit = 0;
+
+.import $filepath_movies movies --csv
+create unique index idx_movieid on movies(movieId);
+
+create table ratings(userId, movieId, rating, timestamp);
+.import $filepath_split ratings --csv
+
+.mode csv
+.header off
+.out tmp_innerjoin_$filename.csv
+
+select genres, sum(rating), count(rating)
+from movies inner join ratings
+on movies.movieId = ratings.movieId
+group by genres;
+
+.exit
+SQL
+done
+
+echo "\nInner join:"
+time wait
+
+echo "\nAverage rating:"
+cat tmp_innerjoin_*.csv > tmp_ratings_min.csv
+time gawk -f average.awk tmp_ratings_min.csv | sort > tmp_out.csv
+
+cat tmp_out.csv
+
+rm tmp_*.csv
diff --git a/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/README.md b/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/README.md
@@ -0,0 +1,48 @@
+# How to run
+
+```console
+$ go build
+$ ./movielens file_to_split total_splits out_folder file_to_merge
+```
+
+where:
+- `file_to_split`: ratings.csv file
+- `total_splits`: number of files to split the ratings file into
+- `out_folder`: directory where the split files will be stored
+- `file_to_merge`: movies.csv file
+
+# Example
+
+```bash
+./movielens "ratings.csv" 10  "." "movies.csv"
+```
+
+## Results
+
+__Split: 229ms__
+
+__Merge & Count: 10363ms__
+
+| GENRE       | AVERAGE_RATING                   |
+|-------------|----------------------------------|
+| (no genres listed)         | 3.326379239118188 |
+| Action      | 3.466591472228235                |
+| Adventure   | 3.517444379462875                |
+| Animation   | 3.614946348438093                |
+| Children    | 3.4325074920278045               |
+| Comedy      | 3.423992522260525                |
+| Crime       | 3.6850431095379377               |
+| Documentary | 3.7052805249822454               |
+| Drama       | 3.6771844525139366               |
+| Fantasy     | 3.5115889157486                  |
+| Film-Noir   | 3.9257258540768367               |
+| Horror      | 3.2935633075659174               |
+| IMAX        | 3.6037121959523324               |
+| Musical     | 3.5547170809260242               |
+| Mystery     | 3.670169244577933                |
+| Romance     | 3.542711629764427                |
+| Sci-Fi      | 3.4781437345156516               |
+| Thriller    | 3.522964338694794                |
+| War         | 3.7914657875591984               |
+| Western     | 3.585752382527443                |
+
diff --git a/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/go.mod b/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/go.mod
@@ -0,0 +1,13 @@
+module movielens
+
+go 1.23.1
+
+require github.com/kfultz07/go-dataframe v0.0.0-20240718202924-1901644284c7
+
+require (
+	github.com/aws/aws-sdk-go v1.44.57 // indirect
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/jmespath/go-jmespath v0.4.0 // indirect
+	golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect
+	gopkg.in/yaml.v2 v2.3.0 // indirect
+)
diff --git a/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/go.sum b/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/go.sum
@@ -0,0 +1,29 @@
+github.com/aws/aws-sdk-go v1.44.57 h1:Dx1QD+cA89LE0fVQWSov22tpnTa0znq2Feyaa/myVjg=
+github.com/aws/aws-sdk-go v1.44.57/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
+github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
+github.com/kfultz07/go-dataframe v0.0.0-20240718202924-1901644284c7 h1:ss8yd7vvaO+HMbZ5x9dnRyUWeX+Mlto1dpTJc/xKo/E=
+github.com/kfultz07/go-dataframe v0.0.0-20240718202924-1901644284c7/go.mod h1:78T0LPP6YPXqorHDU0teK01g3gjNAurB5TJW3o/AF5c=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY=
+golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8=
+golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
+golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
+gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/main.go b/go/src/student_submissions/movielens/gomez_enrique/genre_ratings/main.go
@@ -0,0 +1,118 @@
+package main
+
+import (
+	"fmt"
+	"movielens/split"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	dataframe "github.com/kfultz07/go-dataframe"
+)
+
+// Stats accumulates the ratings seen for a single genre. The average rating is
+// Rating divided by Observations.
+type Stats struct {
+	Rating       float64 // sum of all ratings added so far
+	Observations int     // number of ratings included in Rating
+}
+
+// lock guards the stats map that the processFile goroutines update
+// concurrently. Only the exclusive Lock/Unlock pair is used in this file.
+var lock = &sync.RWMutex{}
+
+// main runs the whole pipeline: it splits the ratings file into chunks, joins
+// every chunk with the movies file concurrently and prints the average rating
+// of each genre, sorted by genre name.
+//
+// Usage: movielens file_to_split total_splits out_folder file_to_merge
+func main() {
+	// Fail with a usage message instead of an index-out-of-range panic.
+	if len(os.Args) < 5 {
+		fmt.Fprintln(os.Stderr, "usage: movielens file_to_split total_splits out_folder file_to_merge")
+		os.Exit(2)
+	}
+
+	// Arg 1: file to split
+	filename := os.Args[1]
+
+	// Arg 2: total chunks
+	totalChunks, err := strconv.Atoi(os.Args[2])
+	if err != nil {
+		panic(err)
+	}
+
+	// Arg 3: out-path
+	outPath := os.Args[3]
+
+	// Arg 4: file to inner join with
+	filenameJoin := os.Args[4]
+
+	// Split file
+	start := time.Now()
+	split.Split(filename, totalChunks, outPath)
+	elapsed := time.Since(start)
+	fmt.Printf("Split: %dms\n", elapsed.Milliseconds())
+
+	// TODO: use generic filepaths
+	dfMovies := dataframe.CreateDataFrame("", filenameJoin)
+
+	// Create a map to keep track of stats
+	stats := make(map[string]*Stats)
+
+	// Collect the chunk files; presumably these are the files written by
+	// split.Split above (verify the naming against the split package).
+	matches, err := filepath.Glob(filepath.Join(outPath, "tmp_ratings*.csv"))
+	if err != nil {
+		panic(err)
+	}
+
+	// Parallel processing: one goroutine per chunk. The path is passed as an
+	// argument so each goroutine owns its value regardless of the Go version.
+	start = time.Now()
+	var wg sync.WaitGroup
+	for _, p := range matches {
+		wg.Add(1)
+		go func(path string) {
+			defer wg.Done()
+			processFile(path, dfMovies, stats)
+		}(p)
+	}
+	wg.Wait()
+	elapsed = time.Since(start)
+	fmt.Printf("Merge & Count: %dms\n\n", elapsed.Milliseconds())
+
+	// Sequential processing
+	//for _, p := range matches {
+	//	processFile(p, dfMovies, stats)
+	//}
+
+	// Sort stats: map iteration order is random, so print by sorted key.
+	keys := make([]string, 0, len(stats))
+	for k := range stats {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+
+	// Print stats
+	for _, k := range keys {
+		s := stats[k]
+		fmt.Println(k, s.Rating/float64(s.Observations))
+	}
+}
+
+// processFile joins one ratings chunk with the movies frame and adds the
+// chunk's per-genre rating sums and counts to stats.
+//
+// filename is the chunk to load, dfMovies must provide the "movieId" and
+// "genres" columns, and stats is the shared accumulator. stats is only
+// modified while holding lock, so processFile may run in several goroutines
+// at once. Any merge or parse failure panics.
+func processFile(filename string, dfMovies dataframe.DataFrame, stats map[string]*Stats) {
+	// NOTE(review): the "./" prefix presumably only works when filename is a
+	// relative path — confirm how CreateDataFrame combines its two arguments.
+	dfRatings := dataframe.CreateDataFrame("./", filename)
+	// TODO: check that no data row is lost when the headers are replaced.
+	// NOTE(review): if CreateDataFrame consumes the first CSV row as a header
+	// and the chunks are written without one, the first rating of every chunk
+	// is dropped — confirm against split.Split.
+	//
+	// The four ratings columns occupy positions 0-3; "timestamp" used to be
+	// mapped to 4, which is past the end of a four-column row.
+	dfRatings.Headers = map[string]int{"userId": 0, "movieId": 1, "rating": 2, "timestamp": 3}
+	err := dfRatings.Merge(&dfMovies, "movieId", "genres")
+	if err != nil {
+		panic(fmt.Errorf("merging %q with movies: %w", filename, err))
+	}
+
+	// Aggregate into a private map first so the shared lock is taken once per
+	// file instead of once per rating row.
+	local := make(map[string]*Stats)
+	for _, row := range dfRatings.KeepColumns([]string{"genres", "rating"}).FrameRecords {
+		// Column order follows the KeepColumns argument: genres, then rating.
+		rating, err := strconv.ParseFloat(row.Data[1], 64)
+		if err != nil {
+			panic(fmt.Errorf("parsing rating in %q: %w", filename, err))
+		}
+
+		// A movie with several genres counts once towards each of them.
+		for _, genre := range strings.Split(row.Data[0], "|") {
+			s, ok := local[genre]
+			if !ok {
+				s = new(Stats)
+				local[genre] = s
+			}
+			s.Rating += rating
+			s.Observations++
+		}
+	}
+
+	// Fold this file's totals into the shared map.
+	lock.Lock()
+	defer lock.Unlock()
+	for genre, s := range local {
+		total, ok := stats[genre]
+		if !ok {
+			total = new(Stats)
+			stats[genre] = total
+		}
+		total.Rating += s.Rating
+		total.Observations += s.Observations
+	}
+}