forked from scoppens/eic-entity-tool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
required_siren_schema.xml
183 lines (142 loc) · 7.27 KB
/
required_siren_schema.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
<?xml version="1.0" encoding="UTF-8" ?>
<!--
This is the SIREn/Solr schema file. This file should be named "schema.xml" and
should be in the conf directory under the solr home
(i.e. ./solr/conf/schema.xml by default)
or located where the classloader for the Solr webapp can find it.
This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.
For more information, on how to customize this file, please see
http://wiki.apache.org/solr/SchemaXml
PERFORMANCE NOTE: this schema includes many optional features and should not
be used for benchmarking. To improve performance one could
- set stored="false" for all fields possible (esp large fields) when you
only need to search on the field but don't need to return the original
value.
- set indexed="false" if you don't need to search on the field, but only
return the field as a result of searching on other indexed fields.
- remove all unneeded copyField statements
- for best index size and searching performance, set indexed="false"
for all general text fields, use copyField to copy them to the
catchall "text" field, and use that for searching.
- For maximum indexing performance, use the StreamingUpdateSolrServer
java client.
- Remember to run the JVM in server mode, and use a higher logging level
that avoids logging every request
-->
<schema name="example" version="1.3">
<types>
<!-- Plain string type: solr.StrField values are not analyzed; they are
     indexed/stored verbatim. -->
<fieldType name="string"
           class="solr.StrField"
           omitNorms="true"
           sortMissingLast="true"/>
<!-- Trie-based date type, for faster date range queries and date faceting.
     No field declared in this file currently uses this type. -->
<fieldType name="tdate"
           class="solr.TrieDateField"
           precisionStep="6"
           positionIncrementGap="0"
           omitNorms="true"/>
<!-- URI field type.
     Uses WhitespaceTokenizer and, at index time, WordDelimiterFilter to
     split URIs into multiple components. Stop words are customized through
     an external file (stopwords.txt).
     omitNorms is true because this is a short field and norms do not
     really make sense for URIs.
     The ASCIIFoldingExpansionFilter is not used, since URIs should not
     contain accented characters.
-->
<fieldType name="uri"
           class="solr.TextField"
           positionIncrementGap="100"
           omitNorms="true">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Split tokens into sub-words at delimiters and at case changes,
         keeping the original token as well (preserveOriginal="1").
         Splitting on numerics is switched off (splitOnNumerics="0") to
         fix SND-355 and SND-1283.
    -->
    <filter class="solr.WordDelimiterFilterFactory"
            preserveOriginal="1"
            splitOnCaseChange="1"
            splitOnNumerics="0"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"/>
    <!-- Keep only tokens whose length lies between min and max, inclusive. -->
    <filter class="solr.LengthFilterFactory" min="2" max="256"/>
    <!-- Lowercase every token. -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Case-insensitive stop word removal.
         enablePositionIncrements="true" is set in both the index and the
         query analyzer to leave a 'gap' where a word was removed, for
         more accurate phrase queries.
    -->
    <filter class="solr.StopFilterFactory"
            words="stopwords.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
  </analyzer>
  <analyzer type="query">
    <!-- Whitespace tokenizer, so that a URI is not broken apart. -->
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Keep only tokens whose length lies between min and max, inclusive. -->
    <filter class="solr.LengthFilterFactory" min="2" max="256"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="stopwords.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <!-- Replace QNames by their namespaces in URIs (configured through
         qnames.txt). -->
    <filter class="org.sindice.siren.solr.analysis.QNamesFilterFactory"
            qnames="qnames.txt"/>
  </analyzer>
</fieldType>
<!--
  The SIREn field type, n-triple variant.
  The top-level analyzers must be defined in the file named by
  analyzerConfig (here tuple-analyzers.xml), and the datatype analyzers in
  the file named by datatypeConfig (here tuple-datatypes.xml).
  Field norms are not useful for SIREn fields: setting omitNorms to true
  reduces memory consumption and improves ranking.
  omitTermFreqAndPositions *must* be set to false.
-->
<fieldType name="ntriple"
           class="org.sindice.siren.solr.schema.SirenField"
           analyzerConfig="tuple-analyzers.xml"
           datatypeConfig="tuple-datatypes.xml"
           omitNorms="true"
           omitTermFreqAndPositions="false"/>
<!-- SIREn field type for the tabular indexing scheme. It is a SirenField
     configured with tuple-analyzers.xml and tuple-datatypes.xml, with norms
     omitted and term frequencies/positions kept. -->
<fieldType name="tabular"
           class="org.sindice.siren.solr.schema.SirenField"
           analyzerConfig="tuple-analyzers.xml"
           datatypeConfig="tuple-datatypes.xml"
           omitNorms="true"
           omitTermFreqAndPositions="false"/>
</types>
<fields>
  <!-- The ID (URL) of the document.
       Uses the 'string' field type (no tokenisation).
  -->
  <field name="id"
         type="string"
         required="true"
         indexed="true"
         stored="true"/>
  <!-- Optional label and description, both of the 'string' field type. -->
  <field name="label"
         type="string"
         required="false"
         indexed="true"
         stored="true"/>
  <field name="description"
         type="string"
         required="false"
         indexed="true"
         stored="true"/>
  <!-- The URL of the document.
       Uses the 'uri' field type so that it is tokenised.
  -->
  <field name="url"
         type="uri"
         required="true"
         indexed="true"
         stored="true"/>
  <!-- Optional field of the SIREn 'ntriple' field type. -->
  <field name="type"
         type="ntriple"
         required="false"
         indexed="true"
         stored="true"/>
  <!-- n-triple indexing scheme -->
  <field name="ntriple"
         type="ntriple"
         multiValued="false"
         indexed="true"
         stored="true"/>
  <!-- tabular indexing scheme (indexed but not stored) -->
  <field name="tabular"
         type="tabular"
         multiValued="false"
         indexed="true"
         stored="false"/>
</fields>
<!-- Field used to determine and enforce document uniqueness (here: "id").
Unless this field is marked with required="false", it will be a required field.
-->
<uniqueKey>id</uniqueKey>
<!-- Field for the QueryParser to use when an explicit field name is absent
(here: the "ntriple" field). -->
<defaultSearchField>ntriple</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR".
Set to AND here, so clauses without an explicit operator are combined with AND. -->
<solrQueryParser defaultOperator="AND"/>
<!-- copyField commands copy one field to another at the time a document
is added to the index. They are used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching.
Here the value of "url" is copied into "id", which is the uniqueKey field.
NOTE(review): "id" is not declared multiValued, so this presumably expects
documents to supply "url" only and no explicit "id"; confirm against the
indexing client. -->
<copyField source="url" dest="id"/>
<!-- Similarity is the scoring routine for each document vs. a query.
The custom SIREn implementation (SirenSimilarity) is specified here. -->
<similarity class="org.sindice.siren.similarity.SirenSimilarity"/>
</schema>