#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
#= Imports ====================================================================
"""
Aleph X-Service wrapper.
This module allows you to query Aleph's X-Services module (Aleph server is
defined by ALEPH_URL in settings.py).
There are two levels of abstraction:
Lowlevel
========
You can use these functions to access Aleph::
searchInAleph(base, phrase, considerSimilar, field)
getDocumentIDs(aleph_search_result, [number_of_docs])
downloadMARCXML(doc_id, library)
downloadMARCOAI(doc_id, base)
Workflow
********
Aleph works in a strange way: it won't allow you to access the desired
information directly.
You have to create a search request by calling searchInAleph() first, which
will return a dictionary with a few important pieces of information about
the session.
This dictionary can be later used as parameter to function getDocumentIDs(),
which will give you list of DocumentID named tuples.
Named tuples are used, because to access your document, you won't need just
document ID number, but also library ID string.
Depending on your system, there may be only one accessible library, or
multiple ones, and then you will be glad that you get both pieces of
information together.
DocumentID can be used as parameter to :func:`downloadMARCXML`.
Let's look at some code (DocumentID has three fields - id, library, base)::
    ids = getDocumentIDs(searchInAleph("nkc", "test", False, "wrd"))
    for id_num, library, base in ids:
        XML = downloadMARCXML(id_num, library)
        # processDocument(XML)
Highlevel
=========
So far, there are only getter wrappers::
getISBNsIDs()
getAuthorsBooksIDs()
getPublishersBooksIDs()
And counting functions (they are one request to aleph faster than just
counting results from getters)::
getISBNCount()
getAuthorsBooksCount()
getPublishersBooksCount()
Other noteworthy properties
===========================
List of valid bases can be obtained by calling :func:`_getListOfBases`, which
returns list of strings.
There is also defined exception tree - see :class:`AlephException` docstring
for details.
"""
from collections import namedtuple
from string import Template
from urllib import quote_plus
import dhtmlparser
from httpkie import Downloader
from settings import *
#= Variables ==================================================================
# URL templates - placeholders follow the string.Template() convention ($NAME)
ALEPH_SEARCH_URL_TEMPLATE = (
    "/X?op=find"
    "&request=$FIELD=$PHRASE"
    "&base=$BASE"
    "&adjacent=$SIMILAR"
)
ALEPH_GET_SET_URL_TEMPLATE = (
    "/X?op=ill_get_set"
    "&set_number=$SET_NUMBER"
    "&start_point=1"
    "&no_docs=$NUMBER_OF_DOCS"
)
ALEPH_GET_DOC_URL_TEMPLATE = (
    "/X?op=ill_get_doc"
    "&doc_number=$DOC_ID"
    "&library=$LIBRARY"
)
ALEPH_GET_OAI_DOC_URL_TEMPLATE = (
    "/X?op=find_doc"
    "&doc_num=$DOC_ID"
    "&base=$BASE"
)
# Field codes accepted by searchInAleph() as its `field` argument
VALID_ALEPH_FIELDS = [
    "wrd",  # words from all descriptive data
    "wtl",  # words from title data (title, part title, edition, original ..)
    "wau",  # words from author data
    "wpb",  # words from publisher data
    "wpp",  # words from place-of-publication data
    "wyr",  # year of publication
    "wkw",  # subject (keywords)
    "sbn",  # ISBN/ISMN
    "ssn",  # ISSN
    "icz",  # record identification number
    "cnb",  # CNB (Czech National Bibliography) number
    "sg",   # shelf mark (signature)
]
# dhtmlparser is used in this module to parse Aleph's XML responses, so its
# list of non-pair tags is emptied - see dhtmlparser documentation
dhtmlparser.NONPAIR_TAGS = []
#= Functions & objects ========================================================
class AlephException(Exception):
    """
    Common base class of the Aleph-specific exceptions defined in this module.

    Exception tree::

        - AlephException
          |- InvalidAlephBaseException
          |- InvalidAlephFieldException
          |- LibraryNotFoundException
          `- DocumentNotFoundException

    Args:
        message (str): description of the problem (required).
    """
    def __init__(self, message):
        super(AlephException, self).__init__(message)
class InvalidAlephBaseException(AlephException):
    """
    Signals a problem with the Aleph base given by the caller (raised by
    :func:`downloadMARCOAI`).

    Args:
        message (str): description of the problem (required).
    """
    def __init__(self, message):
        super(InvalidAlephBaseException, self).__init__(message)
class InvalidAlephFieldException(AlephException):
    """
    Signals that a search field is not listed in `VALID_ALEPH_FIELDS` (raised
    by :func:`searchInAleph`).

    Args:
        message (str): description of the problem (required).
    """
    def __init__(self, message):
        super(InvalidAlephFieldException, self).__init__(message)
class LibraryNotFoundException(AlephException):
    """
    Signals that Aleph answered a document request with a ``<login>`` element
    containing an ``<error>`` (raised by :func:`downloadMARCXML`).

    Args:
        message (str): description of the problem (required).
    """
    def __init__(self, message):
        super(LibraryNotFoundException, self).__init__(message)
class DocumentNotFoundException(AlephException):
    """
    Signals that Aleph reported an error instead of returning the requested
    document (raised by :func:`downloadMARCXML` and :func:`downloadMARCOAI`).

    Args:
        message (str): description of the problem (required).
    """
    def __init__(self, message):
        super(DocumentNotFoundException, self).__init__(message)
class DocumentID(namedtuple("DocumentID", ["id", "library", "base"])):
    """
    Pointer to a single document stored in Aleph.

    Behaves like a plain ``(id, library, base)`` tuple, so it can be unpacked
    or indexed, and its items are also available as attributes.

    Attributes:
        id (int): id of document
        library (str): can be different for each document
        base (str): default "nkc", but really depends at what bases you have
                    defined in your aleph
    """
    # without empty __slots__ every instance would get its own __dict__,
    # which throws away the memory savings of a named tuple
    __slots__ = ()
def _getListOfBases():
    """
    Download Aleph's base-list page and collect names of bases linked from it.

    This function is here mainly for purposes of unittest.

    Returns:
        list: unique base names (strings), as they are used in the
              ``local_base=`` URL parameter of links at that page. The page \
              is lowercased before parsing, so the names are lowercase.
    """
    page = Downloader().download(
        ALEPH_URL + "/F/?func=file&file_name=base-list"
    )
    dom = dhtmlparser.parseString(page.lower())

    bases = set()  # set takes care of uniqueness
    for link in dom.find("a"):
        # only links with local_base somewhere in their href are interesting
        if "href" not in link.params:
            continue
        href = link.params["href"]
        if "local_base" not in href:
            continue

        # link.tld/..?a=b&local_base=XXX -> ["link.tld/..", "a=b", ..]
        sections = href.replace("?", "&", 1).split("&")

        # pick the first section which carries the base and cut XXX from it
        base_section = [
            section for section in sections if "local_base=" in section
        ][0]
        bases.add(base_section.split("=")[1].strip())

    return list(bases)
def _tryConvertToInt(s):
"""
Try convert value from |s| to int.
Returns:
int(s): if the value was successfully converted, or `s` when conversion
failed.
"""
try:
return int(s)
except ValueError:
return s
def _alephResultToDict(dom):
    """
    Convert direct children of a non-nested XML element to dict.

    Tag names become keys and tag contents values (converted to int where
    possible). When a tag name occurs more than once, its values are collected
    into a list, in order of their occurrence.

    Args:
        dom (HTMLElement tree): pre-parsed XML (see dhtmlparser).

    Returns:
        dict: with python data
    """
    parsed = {}

    for element in dom.childs:
        if not element.isOpeningTag():
            continue

        key = element.getTagName().strip()
        val = _tryConvertToInt(element.getContent().strip())

        if key not in parsed:                  # first occurrence
            parsed[key] = val
        elif isinstance(parsed[key], list):    # third and later occurrence
            parsed[key].append(val)
        else:                                  # second - switch to list
            parsed[key] = [parsed[key], val]

    return parsed
def searchInAleph(base, phrase, considerSimilar, field):
    """
    Send request to the aleph search engine.

    Request itself is pretty useless, but it can be later used as parameter
    for :func:`getDocumentIDs`, which can fetch IDs of matching documents
    from Aleph.

    Args:
        base (str): which database you want to use
        phrase (str): what do you want to search
        considerSimilar (bool): fuzzy search, which is not working at all, so
                                don't use it
        field (str): where you want to look (see `VALID_ALEPH_FIELDS`), case
                     is ignored

    Returns:
        aleph_search_record, which is dictionary consisting from those fields:

        | error (optional): present if there was some form of error
        | no_entries (int): number of entries that can be fetch from aleph
        | no_records (int): no idea what is this, but it is always >= than
                            `no_entries`
        | set_number (int): important - something like ID of your request
        | session-id (str): used to count users for licensing purposes
        | base (str): the `base` argument, added by this function

    Example:
        Returned dict::

            {
                'session-id': 'YLI54HBQJESUTS678YYUNKEU4BNAUJDKA914GMF39J6K89VSCB',
                'set_number': 36520,
                'no_records': 1,
                'no_entries': 1
            }

    Raises:
        AlephException: if Aleph doesn't return any information, or if it
                        reports other error than "empty set"
        InvalidAlephFieldException: if specified field is not valid
    """
    # validate before anything else, so bad input never reaches the network
    normalized_field = field.lower()
    if normalized_field not in VALID_ALEPH_FIELDS:
        raise InvalidAlephFieldException("Unknown field '" + field + "'!")

    param_url = Template(ALEPH_SEARCH_URL_TEMPLATE).substitute(
        PHRASE=quote_plus(phrase),  # urlencode phrase
        BASE=base,
        FIELD=normalized_field,     # send exactly what was validated
        SIMILAR="Y" if considerSimilar else "N"
    )

    response = Downloader().download(ALEPH_URL + param_url)
    find = dhtmlparser.parseString(response).find("find")  # <find> element

    if not find:
        raise AlephException("Aleph didn't returned any information.")

    # convert aleph result into dictionary and remember the base with it
    result = _alephResultToDict(find[0])
    result["base"] = base

    if "error" not in result:
        return result

    # "empty set" is not a real error - it just means nothing was found
    if result["error"] == "empty set":
        result["no_entries"] = 0
        return result

    raise AlephException(result["error"])
def getDocumentIDs(aleph_search_result, number_of_docs=-1):
    """
    Get IDs, which can be used as parameters for other functions.

    Args:
        aleph_search_result (dict): returned from :func:`searchInAleph`
        number_of_docs (int, optional): how many :class:`DocumentIDs` from set
                       given by aleph_search_result should be returned,
                       default -1 for all of them.

    Returns:
        list: :class:`DocumentID` named tuples to given `aleph_search_result`,
              in the order in which Aleph returned them, without duplicates
              within one ``<ill-get-set>`` element. Empty list when the \
              search result carries no ``set_number``.

    Raises:
        AlephException: if Aleph returns unknown format of data

    Note:
        Returned :class:`DocumentID` can be used as parameters to
        :func:`downloadMARCXML`.
    """
    # search without set number (empty set, for example) has no documents
    if "set_number" not in aleph_search_result:
        return []

    # set numbers should be probably aligned to some length
    set_number = str(aleph_search_result["set_number"]).zfill(6)

    # limit number of fetched documents, if <= 0, download all
    if number_of_docs <= 0:
        number_of_docs = aleph_search_result["no_entries"]

    # download data about given set
    set_data = Downloader().download(
        ALEPH_URL + Template(ALEPH_GET_SET_URL_TEMPLATE).substitute(
            SET_NUMBER=set_number,
            NUMBER_OF_DOCS=number_of_docs,
        )
    )
    set_elements = dhtmlparser.parseString(set_data).find("ill-get-set")

    # there should be at least one <ill-get-set> field
    if not set_elements:
        raise AlephException("Aleph didn't returned set data.")

    base = aleph_search_result["base"]
    ids = []
    for set_element in set_elements:
        documents = _alephResultToDict(set_element)

        if "doc-number" not in documents or "set-library" not in documents:
            raise AlephException("Aleph returned set data in unknown format.")

        # single document is returned as scalar, more of them as list
        doc_numbers = documents["doc-number"]
        if not isinstance(doc_numbers, list):
            doc_numbers = [doc_numbers]

        # drop duplicates, but keep the order given by Aleph
        library = documents["set-library"]
        seen = set()
        for doc_number in doc_numbers:
            if doc_number in seen:
                continue
            seen.add(doc_number)
            ids.append(DocumentID(doc_number, library, base))

    return ids
def downloadMARCXML(doc_id, library):
    """
    Download MARC XML document with given `doc_id` from given `library`.

    Args:
        doc_id (int/str): number of the document - the ``id`` item of
               :class:`DocumentID`, which you will get from
               :func:`getDocumentIDs`
        library (str): "NKC01" in our case, but don't worry,
                :func:`getDocumentIDs` adds library specification into
                :class:`DocumentID` named tuple.

    Returns:
        str: MARC XML, exactly as it was downloaded from Aleph.

    Raises:
        LibraryNotFoundException: if the response contains ``<error>`` inside
                                  of ``<login>`` element
        DocumentNotFoundException: if the response contains ``<error>`` inside
                                   of ``<ill-get-doc>`` element
    """
    data = Downloader().download(
        ALEPH_URL + Template(ALEPH_GET_DOC_URL_TEMPLATE).substitute(
            DOC_ID=doc_id,
            LIBRARY=library
        )
    )
    dom = dhtmlparser.parseString(data)

    # bad library error
    login = dom.find("login")
    if login:
        error = login[0].find("error")
        if error:
            # find() returns list of elements - message is in the first one
            raise LibraryNotFoundException(
                "Can't download document doc_id: '" + str(doc_id) + "' " +
                "(probably bad library: '" + library + "')!\nMessage: " +
                error[0].getContent()
            )

    # another error - document not found
    document = dom.find("ill-get-doc")
    if document:
        error = document[0].find("error")
        if error:
            raise DocumentNotFoundException(error[0].getContent())

    return data  # MARCxml of document with given doc_id
def downloadMARCOAI(doc_id, base):
    """
    Download MARC OAI document with given `doc_id` from given (logical) `base`.

    Funny part is, that some documents can be obtained only with this function
    in their full text.

    Args:
        doc_id (str): you will get this from :func:`getDocumentIDs`
        base (str): Base from which you want to download Aleph document.
                    This seems to be duplicate with :func:`searchInAleph`
                    parameters, but it's just something Aleph's X-Services
                    wants, so ..

    Returns:
        str: MARC XML, exactly as it was downloaded from Aleph.

    Raises:
        DocumentNotFoundException: if Aleph's error message contains
                                   "Error reading document"
        InvalidAlephBaseException: if Aleph returns any other error
    """
    url = ALEPH_URL + Template(ALEPH_GET_OAI_DOC_URL_TEMPLATE).substitute(
        DOC_ID=doc_id,
        BASE=base
    )
    data = Downloader().download(url)

    # no <error> element means that the document was downloaded properly
    errors = dhtmlparser.parseString(data).find("error")
    if len(errors) == 0:
        return data

    message = errors[0].getContent()
    if "Error reading document" in message:
        raise DocumentNotFoundException(str(message))

    raise InvalidAlephBaseException(
        message + "\n" +
        "The base you are trying to access probably doesn't exist."
    )
def getISBNsIDs(isbn, base=ALEPH_DEFAULT_BASE):
    """
    Search Aleph's "sbn" field for `isbn` and return IDs of matching documents.

    Args:
        isbn (str): ISBN string
        base (str, optional): base on which will be search performed. Default
                              ``settings.ALEPH_DEFAULT_BASE``

    Returns:
        list: of :class:`DocumentID` objects
    """
    search_result = searchInAleph(base, isbn, False, "sbn")
    return getDocumentIDs(search_result)
def getAuthorsBooksIDs(author, base=ALEPH_DEFAULT_BASE):
    """
    Search Aleph's "wau" field for `author` and return IDs of matching
    documents.

    Args:
        author (str): Authors name/lastname in UTF
        base (str, optional): base on which will be search performed. Default
                              ``settings.ALEPH_DEFAULT_BASE``

    Returns:
        list: of :class:`DocumentID` objects
    """
    search_result = searchInAleph(base, author, False, "wau")
    return getDocumentIDs(search_result)
def getPublishersBooksIDs(publisher, base=ALEPH_DEFAULT_BASE):
    """
    Search Aleph's "wpb" field for `publisher` and return IDs of matching
    documents.

    Args:
        publisher (str): name of publisher which will be used to search Aleph
        base (str, optional): base on which will be search performed. Default
                              ``settings.ALEPH_DEFAULT_BASE``

    Returns:
        list: of :class:`DocumentID` objects
    """
    search_result = searchInAleph(base, publisher, False, "wpb")
    return getDocumentIDs(search_result)
def getISBNCount(isbn, base=ALEPH_DEFAULT_BASE):
    """
    Get number of records in Aleph which match given `isbn`.

    Only the search request is sent - no document IDs are fetched.

    Args:
        isbn (str): ISBN string
        base (str, optional): base on which will be search performed. Default
                              ``settings.ALEPH_DEFAULT_BASE``

    Returns:
        int: Number of matching documents in Aleph.
    """
    search_result = searchInAleph(base, isbn, False, "sbn")
    return search_result["no_entries"]
def getAuthorsBooksCount(author, base=ALEPH_DEFAULT_BASE):
    """
    Get number of records in Aleph which match given `author`.

    Only the search request is sent - no document IDs are fetched.

    Args:
        author (str): Authors name/lastname in UTF
        base (str, optional): base on which will be search performed. Default
                              ``settings.ALEPH_DEFAULT_BASE``

    Returns:
        int: Number of matching documents in Aleph.
    """
    search_result = searchInAleph(base, author, False, "wau")
    return search_result["no_entries"]
def getPublishersBooksCount(publisher, base=ALEPH_DEFAULT_BASE):
    """
    Get number of records in Aleph which match given `publisher`.

    Only the search request is sent - no document IDs are fetched.

    Args:
        publisher (str): name of publisher which will be used to search Aleph
        base (str, optional): base on which will be search performed. Default
                              ``settings.ALEPH_DEFAULT_BASE``

    Returns:
        int: Number of matching documents in Aleph.
    """
    search_result = searchInAleph(base, publisher, False, "wpb")
    return search_result["no_entries"]