Skip to content

Commit

Permalink
Merge pull request #138 from USCDataScience/docs
Browse files Browse the repository at this point in the history
Added documentation to methods
  • Loading branch information
chrismattmann authored Apr 5, 2017
2 parents b5bd593 + 716a223 commit 4590b23
Show file tree
Hide file tree
Showing 6 changed files with 313 additions and 26 deletions.
10 changes: 10 additions & 0 deletions tika/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,20 @@
from .tika import detectType1, callServer, ServerEndpoint

def from_file(filename):
    '''
    Detect the MIME type of a file.

    Delegates to detectType1 with the 'type' option and hands back the
    second element (index 1) of whatever that call returns.

    :param filename: file whose type needs to be detected
    :return: MIME type (element at index 1 of the detectType1 result)
    '''
    return detectType1('type', filename)[1]

def from_buffer(string):
    '''
    Detect the MIME type of in-memory content.

    Passes the content to callServer using the 'put' verb against the
    '/detect/stream' service of ServerEndpoint, asking for a
    'text/plain' reply.

    :param string: buffered content whose type needs to be detected
    :return: the response half of the (status, response) pair returned
        by callServer; the status is discarded
    '''
    request_headers = {'Accept': 'text/plain'}
    # Keep the two-element unpacking: only the response is handed back.
    _status, body = callServer('put', ServerEndpoint, '/detect/stream',
                               string, request_headers, False)
    return body
10 changes: 10 additions & 0 deletions tika/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,20 @@
from .tika import detectLang1, callServer, ServerEndpoint

def from_file(filename):
    '''
    Detect the language of a file.

    Delegates to detectLang1 with the 'file' option and hands back the
    second element (index 1) of whatever that call returns.

    :param filename: path to file whose language needs to be detected
    :return: element at index 1 of the detectLang1 result
        (presumably the detected language; confirm against detectLang1)
    '''
    return detectLang1('file', filename)[1]

def from_buffer(string):
    '''
    Detect the language of in-memory content.

    Passes the content to callServer using the 'put' verb against the
    '/language/string' service of ServerEndpoint, asking for a
    'text/plain' reply.

    :param string: buffered data
    :return: the response half of the (status, response) pair returned
        by callServer; the status is discarded
    '''
    request_headers = {'Accept': 'text/plain'}
    # Keep the two-element unpacking: only the response is handed back.
    _status, body = callServer('put', ServerEndpoint, '/language/string',
                               string, request_headers, False)
    return body
28 changes: 25 additions & 3 deletions tika/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@
import json

def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False):
'''
Parses a file for metadata and content
:param filename: path to file which needs to be parsed
:param serverEndpoint: Server endpoint url
:param xmlContent: Whether or not XML content be requested.
Default is 'False', which results in text content.
:return: dictionary having 'metadata' and 'content' keys.
'content' has a str value and metadata has a dict type value.
'''
if not xmlContent:
jsonOutput = parse1('all', filename, serverEndpoint)
else:
Expand All @@ -29,16 +38,29 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False):


def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False):
    '''
    Parses the content from buffer

    The buffer is passed to callServer using the 'put' verb, requesting an
    'application/json' reply, and the resulting (status, response) pair is
    handed to _parse.

    :param string: Buffer value
    :param serverEndpoint: Server endpoint. This is optional
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :return: whatever _parse builds from the (status, response) pair
    '''
    # Both modes issue the same request; only the service path differs.
    service = '/rmeta/xml' if xmlContent else '/rmeta/text'
    status, response = callServer('put', serverEndpoint, service, string,
                                  {'Accept': 'application/json'}, False)

    return _parse((status, response))

def _parse(jsonOutput):
'''
Parses JSON response from Tika REST API server
:param jsonOutput: JSON output from Tika Server
:return: a dictionary having 'metadata' and 'content' values
'''
parsed={}
if not jsonOutput:
return parsed
Expand All @@ -62,7 +84,7 @@ def _parse(jsonOutput):
if n != "X-TIKA:content":
if n in parsed["metadata"]:
if not isinstance(parsed["metadata"][n], list):
parsed["metadata"][n] = [parsed["metadata"][n]]
parsed["metadata"][n] = [parsed["metadata"][n]]
parsed["metadata"][n].append(js[n])
else:
parsed["metadata"][n] = js[n]
Expand Down
Loading

0 comments on commit 4590b23

Please sign in to comment.