-
Notifications
You must be signed in to change notification settings - Fork 0
/
FileIO.scala
122 lines (103 loc) · 3.75 KB
/
FileIO.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package search.src
import java.io.{BufferedReader, BufferedWriter, FileReader, FileWriter}
import scala.collection.mutable.HashMap
object FileIO {

  /**
   * Writes the title index, one document per line:
   * {{{
   * ID_1::Title_1
   * ID_2::Title_2
   * ...
   * }}}
   *
   * @param titleFile   path of the output file
   * @param idsToTitles map from document id to document title
   */
  def printTitleFile(titleFile: String, idsToTitles: HashMap[Int, String]): Unit = {
    val titleWriter = new BufferedWriter(new FileWriter(titleFile))
    try {
      for ((id, title) <- idsToTitles) {
        titleWriter.write(s"$id::$title\n")
      }
    } finally {
      // Release the file handle even if a write throws.
      titleWriter.close()
    }
  }

  /**
   * Writes the document index, one document per line:
   * {{{
   * ID_1 Ed_1 PageRank_1
   * ID_2 Ed_2 PageRank_2
   * ...
   * }}}
   *
   * @param documentFile   path of the output file
   * @param idsToMaxCounts map from document id to its normalization value
   * @param idsToPageRanks map from document id to its page rank; must contain
   *                       every id present in idsToMaxCounts (a missing id
   *                       throws NoSuchElementException, as before)
   */
  def printDocumentFile(documentFile: String,
      idsToMaxCounts: HashMap[Int, Double],
      idsToPageRanks: HashMap[Int, Double]): Unit = {
    val docWriter = new BufferedWriter(new FileWriter(documentFile))
    try {
      for ((id, maxCount) <- idsToMaxCounts) {
        docWriter.write(s"$id $maxCount ${idsToPageRanks(id)}\n")
      }
    } finally {
      docWriter.close()
    }
  }

  /**
   * Writes the word index: for each word, the document ids it appears in
   * interleaved with its frequency in that document:
   * {{{
   * Word_1 ID_a Freq_a ID_b Freq_b ...
   * Word_2 ID_c Freq_c ID_d Freq_d ...
   * ...
   * }}}
   *
   * @param wordsFile                  path of the output file
   * @param wordsToDocumentFrequencies map from word to (document id -> frequency)
   */
  def printWordsFile(wordsFile: String,
      wordsToDocumentFrequencies: HashMap[String, HashMap[Int, Double]]): Unit = {
    val wordWriter = new BufferedWriter(new FileWriter(wordsFile))
    try {
      for ((word, freqMap) <- wordsToDocumentFrequencies) {
        wordWriter.write(s"$word ")
        // Ids of documents followed by the word's frequency in that document.
        for ((id, frequency) <- freqMap) {
          wordWriter.write(s"$id $frequency ")
        }
        wordWriter.write("\n")
      }
    } finally {
      wordWriter.close()
    }
  }

  /**
   * Reads a title index written by [[printTitleFile]] and populates the
   * given id-to-title map.
   *
   * @param titleIndex path of the title index file
   * @param idsToTitle map to populate with document id -> title entries
   */
  def readTitles(titleIndex: String, idsToTitle: HashMap[Int, String]): Unit = {
    val titlesReader = new BufferedReader(new FileReader(titleIndex))
    try {
      var line = titlesReader.readLine()
      while (line != null) {
        // Split on the FIRST "::" only, so titles that themselves contain
        // "::" survive a write/read round trip.
        val tokens = line.split("::", 2)
        idsToTitle(tokens(0).toInt) = tokens(1)
        line = titlesReader.readLine()
      }
    } finally {
      titlesReader.close()
    }
  }

  /**
   * Reads a word index written by [[printWordsFile]] and populates the given
   * word-to-document-frequencies map.
   *
   * @param wordIndex                  path of the word index file
   * @param wordsToDocumentFrequencies map to populate; each word gets a fresh
   *                                   (document id -> frequency) map
   */
  def readWords(wordIndex: String,
      wordsToDocumentFrequencies: HashMap[String, HashMap[Int, Double]]): Unit = {
    val wordsReader = new BufferedReader(new FileReader(wordIndex))
    try {
      var line = wordsReader.readLine()
      while (line != null) {
        val tokens = line.split(" ")
        wordsToDocumentFrequencies(tokens(0)) = new HashMap[Int, Double]
        // tokens(0) is the word; the rest alternate: id, frequency, id, frequency...
        for (i <- 1 until (tokens.size - 1) by 2) {
          wordsToDocumentFrequencies(tokens(0)) += (tokens(i).toInt -> tokens(i + 1).toDouble)
        }
        line = wordsReader.readLine()
      }
    } finally {
      wordsReader.close()
    }
  }

  /**
   * Reads a document index written by [[printDocumentFile]] and populates the
   * two given maps.
   *
   * @param documentIndex path of the document index file
   * @param idsToMaxFreqs map to populate with document id -> max word frequency
   * @param idsToPageRank map to populate with document id -> page rank
   */
  def readDocuments(documentIndex: String, idsToMaxFreqs: HashMap[Int, Double], idsToPageRank: HashMap[Int, Double]): Unit = {
    val documentsReader = new BufferedReader(new FileReader(documentIndex))
    try {
      var line = documentsReader.readLine()
      while (line != null) {
        val tokens = line.split(" ")
        // Line layout: id, max word frequency, page rank.
        idsToMaxFreqs(tokens(0).toInt) = tokens(1).toDouble
        idsToPageRank(tokens(0).toInt) = tokens(2).toDouble
        line = documentsReader.readLine()
      }
    } finally {
      documentsReader.close()
    }
  }
}