diff --git a/tika/detector.py b/tika/detector.py index 68480c5..558e0bf 100644 --- a/tika/detector.py +++ b/tika/detector.py @@ -19,10 +19,20 @@ from .tika import detectType1, callServer, ServerEndpoint def from_file(filename): + ''' + Detects MIME type of specified file + :param filename: file whose type needs to be detected + :return: MIME type + ''' jsonOutput = detectType1('type', filename) return jsonOutput[1] def from_buffer(string): + ''' + Detects MIME type of the buffered content + :param string: buffered content whose type needs to be detected + :return: MIME type + ''' status, response = callServer('put', ServerEndpoint, '/detect/stream', string, {'Accept': 'text/plain'}, False) return response diff --git a/tika/language.py b/tika/language.py index 8d5b5be..721f248 100644 --- a/tika/language.py +++ b/tika/language.py @@ -19,10 +19,20 @@ from .tika import detectLang1, callServer, ServerEndpoint def from_file(filename): + ''' + Detects language of the file + :param filename: path to file whose language needs to be detected + :return: detected language + ''' jsonOutput = detectLang1('file', filename) return jsonOutput[1] def from_buffer(string): + ''' + Detects language of content in the buffer + :param string: buffered data + :return: detected language + ''' status, response = callServer('put', ServerEndpoint, '/language/string', string, {'Accept': 'text/plain'}, False) return response diff --git a/tika/parser.py b/tika/parser.py index 2cc2e07..df117ca 100644 --- a/tika/parser.py +++ b/tika/parser.py @@ -21,6 +21,15 @@ import json def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False): + ''' + Parses a file for metadata and content + :param filename: path to file which needs to be parsed + :param serverEndpoint: Server endpoint url + :param xmlContent: Whether or not XML content should be requested. + Default is 'False', which results in text content. + :return: dictionary having 'metadata' and 'content' keys. + 'content' has a str value and metadata has a dict type value. 
+ ''' if not xmlContent: jsonOutput = parse1('all', filename, serverEndpoint) else: @@ -29,16 +38,29 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False): def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False): + ''' + Parses the content from buffer + :param string: Buffer value + :param serverEndpoint: Server endpoint. This is optional + :param xmlContent: Whether or not XML content should be requested. + Default is 'False', which results in text content. + :return: dictionary having 'metadata' and 'content' keys + ''' if not xmlContent: status, response = callServer('put', serverEndpoint, '/rmeta/text', string, {'Accept': 'application/json'}, False) else: status, response = callServer('put', serverEndpoint, '/rmeta/xml', string, - {'Accept': 'application/json'}, False) - + {'Accept': 'application/json'}, False) + return _parse((status,response)) def _parse(jsonOutput): + ''' + Parses JSON response from Tika REST API server + :param jsonOutput: JSON output from Tika Server + :return: a dictionary having 'metadata' and 'content' values + ''' parsed={} if not jsonOutput: return parsed @@ -62,7 +84,7 @@ def _parse(jsonOutput): if n != "X-TIKA:content": if n in parsed["metadata"]: if not isinstance(parsed["metadata"][n], list): - parsed["metadata"][n] = [parsed["metadata"][n]] + parsed["metadata"][n] = [parsed["metadata"][n]] parsed["metadata"][n].append(js[n]) else: parsed["metadata"][n] = js[n] diff --git a/tika/tika.py b/tika/tika.py index 88e08a0..1a3623a 100644 --- a/tika/tika.py +++ b/tika/tika.py @@ -14,7 +14,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# + +# Module documentation +''' +Tika Python module provides a Python API client for Apache Tika Server. 
+ +**Example usage**:: + + import tika + from tika import parser + parsed = parser.from_file('/path/to/file') + print(parsed["metadata"]) + print(parsed["content"]) + +Visit https://github.com/chrismattmann/tika-python to learn more about it. + +**Detect IANA MIME Type**:: + + from tika import detector + print(detector.from_file('/path/to/file')) + +**Detect Language**:: + + from tika import language + print(language.from_file('/path/to/file')) + +**Use Tika Translate**:: + + from tika import translate + print(translate.from_file('/path/to/file', 'srcLang', 'destLang')) + # Use auto Language detection feature + print(translate.from_file('/path/to/file', 'destLang')) + + +''' USAGE = """ tika.py [-v] [-e] [-o ] [--server ] [--install ] [--port ]