/**
* Copyright (c) 2013/2014 Verein zur Foerderung der IT-Sicherheit in Oesterreich (SBA).
* The work has been developed in the TIMBUS Project and the above-mentioned are Members of the TIMBUS Consortium.
* TIMBUS is supported by the European Union under the 7th Framework Programme for research and technological
* development and demonstration activities (FP7/2007-2013) under grant agreement no. 269940.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including without
* limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTIBITLY, or FITNESS FOR A PARTICULAR
* PURPOSE. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise,
* unless required by applicable law or agreed to in writing, shall any Contributor be liable for damages, including
* any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this
* License or out of the use or inability to use the Work.
* See the License for the specific language governing permissions and limitation under the License.
*/
package org.sba_research.timbus.kb.importer;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.OntModelSpec;
import com.hp.hpl.jena.query.*;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import org.sba_research.timbus.kb.Utils;
import org.sbaresearch.owl.OwlApiFacade;
import org.sbaresearch.owl.OwlElementNotFoundException;
import org.semanticweb.owlapi.model.IRI;
import org.semanticweb.owlapi.model.OWLNamedIndividual;
import org.semanticweb.owlapi.model.OWLOntologyStorageException;
import org.semanticweb.owlapi.vocab.OWL2Datatype;
import uk.ac.manchester.cs.owl.owlapi.OWL2DatatypeImpl;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
/**
* Uses Jena to perform queries on the ontology, because involving a reasoner is way too slow.
*/
public class PronomImporter implements DataImporter {
    private static final Logger LOG = Logger.getLogger(PronomImporter.class.getName());
    /** File the ontology is saved to so that Jena can read it back in. */
    private static final String TMP_FILENAME = "toolKB_instance.owl";
    /** SPARQL endpoint that is queried when the local format cache does not exist yet. */
    private static final String PRONOM_ENDPOINT = "http://test.linkeddatapronom.nationalarchives.gov.uk/sparql/endpoint.php";
    /** PUID value that is treated as "no PUID"; the XPUID is used instead when one is bound. */
    private static final String NULL_PUID = "fmt/null";
    /** Prefix declaration shared by the local SPARQL queries against the saved ontology. */
    private static final String KBS_PREFIX = "PREFIX kbs: <http://timbus.teco.edu/ontologies/preservationIdentifier/toolKB.owl#>\n";

    private OwlApiFacade owl;
    private String kb;

    /**
     * Creates the local format cache if it is missing, loads it and adds registry data for every
     * matching format individual to the given ontology.
     *
     * @param owl facade of the ontology that is read and extended
     * @param kb  IRI prefix that is combined with {@code #ClassName} / {@code #propertyName} fragments
     * @throws IOException                 if the cache file cannot be written or read
     * @throws OwlElementNotFoundException if a required ontology element is missing
     * @throws DataImporterException       if the ontology cannot be stored for querying
     */
    @Override
    public void populate(OwlApiFacade owl, String kb) throws IOException, OwlElementNotFoundException, DataImporterException {
        LOG.info("loading data...");
        this.owl = owl;
        this.kb = kb;
        File formats = new File("cache_pronom_formats.json");
        File tools = new File("cache_pronom_tools.json");
        recreateCache(formats, tools);
        try {
            populateWith(loadCache(formats));
        } catch (OWLOntologyStorageException e) {
            throw new DataImporterException(e);
        }
    }

    /**
     * Reads a cached SPARQL result set (JSON) from the given file.
     *
     * @param file cache file written by {@link #queryToFile(File, String)}
     * @return the result set parsed from the file
     * @throws IOException if the file cannot be opened or closed
     */
    private ResultSet loadCache(File file) throws IOException {
        // try-with-resources: the stream is released even when parsing the JSON fails.
        try (FileInputStream inputStream = new FileInputStream(file)) {
            return ResultSetFactory.fromJSON(inputStream);
        }
    }

    /**
     * Queries the remote endpoint for format data and stores the result in {@code formats},
     * unless that file already exists.
     *
     * @param formats cache file for the format query
     * @param tools   cache file for the tools query; currently unused because that query is disabled
     * @throws IOException if the cache file cannot be written
     */
    private void recreateCache(File formats, File tools) throws IOException {
        String sparqlQueryString = "select distinct ?name ?ext ?puid ?xpuid ?mimetype ?type where {\n" +
                "  ?s <http://www.w3.org/2000/01/rdf-schema#label> ?name .\n" +
                "  ?s <http://reference.data.gov.uk/technical-registry/extension> ?ext .\n" +
                "  OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/PUID> ?puid . }\n" +
                "  OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/XPUID> ?xpuid . }\n" +
                "  OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/MIMETYPE> ?mimetype . }\n" +
                "  OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/formatType> ?type. }\n" +
                "}\n";// +
        // "limit 250";
        if (!formats.exists()) {
            LOG.info("Querying endpoint...");
            queryToFile(formats, sparqlQueryString);
        }
        // Disabled: CONSTRUCT query that was meant to fill the tools cache.
        /*
        sparqlQueryString = "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n" +
        " CONSTRUCT\n" +
        " { ?s <http://www.w3.org/2000/01/rdf-schema#label> ?var_1 .\n" +
        " ?s <http://reference.data.gov.uk/technical-registry/PUID> ?var_2 .\n" +
        " ?s <http://reference.data.gov.uk/technical-registry/XPUID> ?var_3 .\n" +
        " ?s <http://purl.org/dc/elements/1.1/description> ?var_4 .\n" +
        " ?s <http://reference.data.gov.uk/technical-registry/version> ?var_5 .\n" +
        " }\n" +
        " WHERE\n" +
        " {\n" +
        " \n" +
        " ?s ?p ?o .\n" +
        " OPTIONAL { ?s <http://www.w3.org/2000/01/rdf-schema#label> ?var_1 . }\n" +
        "\n" +
        " OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/PUID> ?var_2 . }\n" +
        "\n" +
        " OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/XPUID> ?var_3 . }\n" +
        "\n" +
        " OPTIONAL { ?s <http://purl.org/dc/elements/1.1/description> ?var_4 . }\n" +
        "\n" +
        " OPTIONAL { ?s <http://reference.data.gov.uk/technical-registry/version> ?var_5 . }\n" +
        "}";
        if (!tools.exists()) {
        queryToFile(tools, sparqlQueryString);
        }
        */
    }

    /**
     * Runs a SELECT query against the remote endpoint and writes the result to {@code file} as JSON.
     * If anything fails after the file has been created, the incomplete file is removed again so
     * that a later run does not mistake it for a valid cache.
     *
     * @param file              target cache file
     * @param sparqlQueryString SELECT query to execute
     * @throws IOException if the file cannot be written
     */
    private void queryToFile(File file, String sparqlQueryString) throws IOException {
        Query query = QueryFactory.create(sparqlQueryString);
        QueryExecution queryExecution = QueryExecutionFactory.sparqlService(PRONOM_ENDPOINT, query);
        boolean complete = false;
        try {
            ResultSet results = queryExecution.execSelect();
            try (FileOutputStream outputStream = new FileOutputStream(file)) {
                ResultSetFormatter.outputAsJSON(outputStream, results);
            }
            complete = true;
        } finally {
            queryExecution.close();
            // recreateCache() only checks for existence, so a truncated file must not survive.
            if (!complete && file.exists() && !file.delete()) {
                LOG.warning("Could not remove incomplete cache file: " + file);
            }
        }
    }

    /**
     * Walks over the cached rows, looks up the matching format individuals in the ontology and
     * attaches a registry with "id" and "mimetype" entries to each of them.
     *
     * @param results rows with the variables ext, puid, xpuid and mimetype (all but ext optional)
     * @throws DataImporterException       if a registry cannot be created for a format
     * @throws OwlElementNotFoundException if a required ontology element is missing
     * @throws OWLOntologyStorageException if the ontology cannot be saved for querying
     */
    private void populateWith(ResultSet results) throws DataImporterException, OwlElementNotFoundException, OWLOntologyStorageException {
        LOG.info("populating owl...");
        Model jenaModel = getJenaModel();
        while (results.hasNext()) {
            QuerySolution result = results.next();
            String ext = Utils.cleanExtension(literalText(result, "ext"));
            // puid and xpuid are OPTIONAL in the cache query, so either may be unbound.
            String puid = literalText(result, "puid");
            if ((puid.isEmpty() || puid.equals(NULL_PUID)) && result.contains("xpuid")) {
                puid = literalText(result, "xpuid");
            }
            if (ext.isEmpty() && puid.isEmpty()) {
                LOG.severe("Neither extension nur PUID are set.");
                continue;
            }
            boolean hasId = result.contains("puid") || result.contains("xpuid");
            boolean hasMimetype = result.contains("mimetype");
            Set<OWLNamedIndividual> pronomFormats = findFormat(jenaModel, ext, puid);
            for (OWLNamedIndividual indiv : pronomFormats) {
                String formatName = getName(indiv);
                try {
                    OWLNamedIndividual registry = safeAddRegistry(indiv);
                    if (hasId) {
                        if (puid.equals(NULL_PUID)) {
                            LOG.warning("Ignoring puid=fmt/null for: " + formatName);
                        } else {
                            addEntryToRegistry(formatName, "id", puid, registry);
                        }
                    }
                    if (hasMimetype) {
                        addEntryToRegistry(formatName, "mimetype", result.getLiteral("mimetype").getString(), registry);
                    }
                } catch (OwlElementNotFoundException e) {
                    throw new DataImporterException(e);
                }
            }
        }
    }

    /**
     * Returns the string form of the literal bound to {@code var}, or an empty string when the
     * variable is not bound in this row.
     */
    private static String literalText(QuerySolution solution, String var) {
        return solution.contains(var) ? solution.getLiteral(var).toString() : "";
    }

    /**
     * Saves the ontology to {@link #TMP_FILENAME} and reads that file into an in-memory Jena
     * ontology model as RDF/XML.
     *
     * @return the Jena model holding the saved ontology
     * @throws OWLOntologyStorageException if saving the ontology fails
     */
    private Model getJenaModel() throws OWLOntologyStorageException {
        owl.save(TMP_FILENAME);
        OntModel model = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM);
        return model.read(TMP_FILENAME, "RDF/XML");
    }

    /**
     * Returns the part of the individual's IRI after the first '#', or the whole IRI if it has none.
     */
    private String getName(OWLNamedIndividual indiv) {
        String iri = indiv.getIRI().toString();
        return iri.substring(iri.indexOf("#") + 1);
    }

    /**
     * Try to map Pronom formats to Freebase formats by Pronom ID, fallback to a mapping by file extension.
     */
    private Set<OWLNamedIndividual> findFormat(Model model, String ext, String puid) throws DataImporterException, OwlElementNotFoundException {
        Set<OWLNamedIndividual> indivs = toIndivSet(findFormatByPuid(model, puid));
        if (indivs.isEmpty()) {
            indivs.addAll(toIndivSet(findFormatByExtension(model, ext)));
        }
        return indivs;
    }

    /**
     * Executes the query, resolves every {@code ?format} binding to an individual of the ontology
     * and closes the query execution. Bindings that cannot be resolved are logged and skipped.
     *
     * @param qe query execution selecting a {@code ?format} variable
     * @return mutable set of the individuals that were found
     */
    private Set<OWLNamedIndividual> toIndivSet(QueryExecution qe) {
        Set<OWLNamedIndividual> individuals = new HashSet<>();
        try {
            // execSelect() sits inside the try so the execution is closed even if it throws.
            ResultSet results = qe.execSelect();
            while (results.hasNext()) {
                String formatIri = results.nextSolution().get("format").toString();
                try {
                    individuals.add(owl.getIndividual(OwlApiFacade.getFragment(IRI.create(formatIri))));
                } catch (OwlElementNotFoundException e) {
                    LOG.severe("Format individual not found in ontology: " + formatIri);
                }
            }
        } finally {
            qe.close();
        }
        return individuals;
    }

    /**
     * Builds a query for all formats that have a registry entry with key "extension" and the given value.
     */
    private QueryExecution findFormatByExtension(Model model, String ext) {
        return findFormatByRegistryEntry(model, "extension", ext);
    }

    /**
     * Builds a query for all formats that have a registry entry with key "puid" and the given value.
     */
    private QueryExecution findFormatByPuid(Model model, String puid) {
        return findFormatByRegistryEntry(model, "puid", puid);
    }

    /**
     * Builds a query for all {@code kbs:FileFormat} individuals identified by a registry that
     * contains an entry with the given key and value. Both are escaped before they are embedded
     * in the query text, so quotes or backslashes in the data cannot break or alter the query.
     */
    private QueryExecution findFormatByRegistryEntry(Model model, String key, String value) {
        String queryString = KBS_PREFIX +
                "SELECT ?format\n" +
                "WHERE {\n" +
                "   ?registryEntry a kbs:RegistryEntry .\n" +
                "   ?registryEntry kbs:hasKey ?key .\n" +
                "   ?registryEntry kbs:hasValue ?value .\n" +
                "   ?registry a kbs:FormatRegistry .\n" +
                "   ?registry kbs:isConsistingOf ?registryEntry .\n" +
                "   ?format a kbs:FileFormat .\n" +
                "   ?format kbs:isIdentifiedBy ?registry .\n" +
                "   FILTER (str(?value) = \"" + escapeSparqlLiteral(value) + "\" ).\n" +
                "   FILTER (str(?key) = \"" + escapeSparqlLiteral(key) + "\" ).\n" +
                "}";
        return QueryExecutionFactory.create(QueryFactory.create(queryString), model);
    }

    /**
     * Escapes backslash, double quote, newline, carriage return and tab so that the value can be
     * placed between double quotes in a SPARQL query.
     */
    private static String escapeSparqlLiteral(String value) {
        StringBuilder escaped = new StringBuilder(value.length() + 8);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Returns the Pronom registry individual of the given format, creating it and linking it to
     * the format via {@code isIdentifiedBy} when it does not exist yet.
     *
     * @param format format individual the registry belongs to
     * @return the existing or newly created registry individual
     * @throws OwlElementNotFoundException if the registry cannot be looked up
     */
    private OWLNamedIndividual safeAddRegistry(OWLNamedIndividual format) throws OwlElementNotFoundException {
        String registryName = "registry_format_" + getName(format) + "_pronom";
        if (!owl.containsIndividual(registryName)) {
            OWLNamedIndividual registry = owl.addIndividual(registryName, kb + "#FormatRegistry");
            owl.addObjectProperty(format, kb + "#isIdentifiedBy", registry);
        }
        return owl.getIndividual(registryName);
    }

    /**
     * Adds a registry entry individual carrying the given key/value pair (both as xsd:string)
     * and links it to the registry via {@code isConsistingOf}.
     *
     * @param formatName name of the format, used to build the entry's individual name
     * @param key        value for the entry's {@code hasKey} property
     * @param value      value for the entry's {@code hasValue} property
     * @param registry   registry the entry is attached to
     */
    private void addEntryToRegistry(String formatName, String key, String value, OWLNamedIndividual registry) {
        OWLNamedIndividual registryEntry = owl.addIndividual("registry_format_" + formatName + "_pronom_" + key, kb + "#RegistryEntry");
        owl.addDataProperty(registryEntry, kb + "#hasKey", owl.getOWLLiteral(key, OWL2DatatypeImpl.getDatatype(OWL2Datatype.XSD_STRING)));
        owl.addDataProperty(registryEntry, kb + "#hasValue", owl.getOWLLiteral(value, OWL2DatatypeImpl.getDatatype(OWL2Datatype.XSD_STRING)));
        owl.addObjectProperty(registry, kb + "#isConsistingOf", registryEntry);
    }
}