Parent: [346c31] (diff)

Download this file

ProvenanceAccessClient.java    466 lines (377 with data), 18.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
package net.timbusproject.extractors.modules.tavernaextractor.provenance;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import net.sf.taverna.raven.appconfig.ApplicationUserHome;
import net.sf.taverna.t2.invocation.InvocationContext;
import net.sf.taverna.t2.provenance.api.ProvenanceAccess;
import net.sf.taverna.t2.provenance.api.ProvenanceConnectorType;
import net.sf.taverna.t2.provenance.connector.ProvenanceConnector;
import net.sf.taverna.t2.provenance.lineageservice.utils.PortBinding;
import net.sf.taverna.t2.provenance.lineageservice.utils.ProvenanceProcessor;
import net.sf.taverna.t2.provenance.lineageservice.utils.WorkflowRun;
import net.sf.taverna.t2.reference.ExternalReferenceSPI;
import net.sf.taverna.t2.reference.Identified;
import net.sf.taverna.t2.reference.ReferenceServiceException;
import net.sf.taverna.t2.reference.ReferenceSet;
import net.sf.taverna.t2.reference.T2Reference;
import net.sf.taverna.t2.reference.ValueCarryingExternalReference;
import net.timbusproject.extractors.modules.tavernaextractor.CSVEntry;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.bean.ColumnPositionMappingStrategy;
import au.com.bytecode.opencsv.bean.CsvToBean;
//import uk.gov.nationalarchives.droid.command.DroidCommandLine;
//import uk.gov.nationalarchives.droid.command.action.CommandLineException;
/**
* Class for accessing provenance data generated by a Taverna Workflow Run, which is stored in the standard Derby
* database.
* <p/>
* Tested with Taverna 2.4.0.
*
* @author munterberger
*/
public class ProvenanceAccessClient {
/** Shared log4j logger of this client; its level is raised to DEBUG by the constructor. */
private static final Logger LOGGER = LogManager.getLogger("ProvenanceAccessClient");
/** Taverna release whose application user home is used to locate the default database. */
private static final String TAVERNA_VERSION = "2.4.0";
/** Default database location: the "t2-database" entry below the "taverna-2.4.0" application user home. */
private final String defaultPathToDerbyDatabase = new ApplicationUserHome("taverna-" + TAVERNA_VERSION)
        .getAppUserHome() + File.separator + "t2-database";
/** Database location actually used; set by the constructors. */
private String t2Databasepath;
/** Internal (sequential) workflow ID -> workflow run description, filled by listAllWorkflowsReadable(). */
private final Map<Integer, ProvenanceAccessEntry> workflowMap = new HashMap<>();
// Handles obtained in setUp(); they stay null until setUp() has assigned them.
private DerbyProvenanceClient c;
private InvocationContext ic;
private ProvenanceAccess pa;
private ProvenanceConnector pc;
/** Lets setUp() return immediately on repeated calls. */
private boolean setupAlreadyCalled = false;
/**
 * Creates a client that works on the default database location and switches the class logger to
 * DEBUG level.
 */
public ProvenanceAccessClient() {
    t2Databasepath = defaultPathToDerbyDatabase;
    LOGGER.setLevel(Level.DEBUG);
}

/**
 * Creates a client that works on the given database location instead of the default one.
 *
 * @param t2Databasepath location of the t2-database to read from
 */
public ProvenanceAccessClient(String t2Databasepath) {
    // Run the default initialisation first, then override the database location.
    this();
    this.t2Databasepath = t2Databasepath;
}
/**
 * Setup routine: connects to the Derby provenance database at the configured location and caches the
 * invocation context, provenance access and provenance connector for later use.
 * <p>
 * Once an attempt has succeeded, further calls return immediately. A failed attempt is logged together
 * with its stack trace and leaves the client un-initialised, so the next call tries again instead of
 * permanently keeping {@code null} handles.
 */
public void setUp() {
    if (setupAlreadyCalled) {
        return;
    }
    DerbyProvenanceClient client = new DerbyProvenanceClient(t2Databasepath);
    try {
        client.setUp(ProvenanceConnectorType.DERBY);
        InvocationContext context = client.getInvocationContext();
        ProvenanceAccess access = client.getProvenanceAccess();
        ProvenanceConnector connector = access.getProvenanceConnector();
        // Publish the handles only after every step succeeded, so the fields are never half initialised.
        c = client;
        ic = context;
        pa = access;
        pc = connector;
        setupAlreadyCalled = true;
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace ends up in the log, not on stderr.
        LOGGER.error("ERROR: " + e.getMessage(), e);
    }
}
/**
 * Returns the provenance access handle that {@link #setUp()} stores.
 *
 * @return the cached handle; {@code null} as long as {@code setUp()} has not assigned one
 */
public ProvenanceAccess getProvenanceAccess() {
    return this.pa;
}
/*
According to the DROID command-line usage:
" Command line usage does not currently allow all operations to be combined in a single
command-line. Some options simply print information to the screen, such as the version
of DROID. For the others, you can create new profiles and save the results in one step.
To filter, report or export a profile requires a second step, opening a previously
saved profile to process. "
*/
/**
 * Collects the data referenced by one recorded workflow run into a directory.
 * <p>
 * The run is selected by the internal ID that {@link #listAllWorkflowsReadable()} hands out. Every port
 * binding of that run is resolved through the reference service. String values that name a non-empty
 * local file or match the URL pattern, as well as byte-array values, are copied into uniquely named
 * files inside {@code droidOutputFilePath}.
 * <p>
 * The output location is used as the parent directory for the copies, so it is created as a directory
 * when it does not exist yet. Nothing is created or deleted when the arguments are rejected.
 *
 * @param id internal workflow ID (key of the map returned by {@code listAllWorkflowsReadable()})
 * @param droidOutputFilePath directory that receives the copies
 * @return the path built from {@code droidOutputFilePath}
 * @throws IllegalArgumentException if {@code id} is {@code null} or the path is {@code null} or empty
 * @throws ProvenanceAccessException if the provenance handles are not available or {@code id} is unknown
 * @throws ReferenceServiceException if raised by the reference service while resolving a binding
 * @throws SQLException if raised by the provenance API calls made here
 * @throws IOException if the directory or one of the copies cannot be created
 */
public Path processWorkflow(Integer id, final String droidOutputFilePath) throws ProvenanceAccessException,
        ReferenceServiceException, SQLException, IOException {
    if (id == null) {
        throw new IllegalArgumentException("ID must not be null");
    }
    if (droidOutputFilePath == null || droidOutputFilePath.isEmpty()) {
        throw new IllegalArgumentException("DROID output path must not be null or empty");
    }
    // call the setup routine
    setUp();
    if (pa == null || pc == null || ic == null) {
        // setUp() only logs its failures; fail with the declared exception instead of a NullPointerException.
        throw new ProvenanceAccessException("ERROR: Provenance database at " + t2Databasepath
                + " is not available.");
    }
    // get internal ID.
    ProvenanceAccessEntry entry = workflowMap.get(id);
    if (entry == null) {
        throw new ProvenanceAccessException("ERROR: No entry found for ID " + id + ".");
    }
    // The location is created only after all checks passed, so there is nothing to clean up on failure.
    final File droidOutputFile = new File(droidOutputFilePath);
    if (!droidOutputFile.exists()) {
        // File.createTempFile below needs a directory as parent; a plain file here would make it fail.
        Files.createDirectories(droidOutputFile.toPath());
        LOGGER.debug("Output directory successfully created! " + droidOutputFile.isDirectory());
    } else {
        LOGGER.debug("Output directory already exists! Collected files are added to it.");
    }
    String workflowRunID = entry.getWorkflowRunId();
    String workflowID = entry.getWorkflowId();
    LOGGER.info("Get provenance data for workflow [ID = " + workflowID + "] and run [ID = " + workflowRunID + "].");
    //final Path tempDir = Files.createTempDirectory("workflowRunID-" + workflowRunID + "-");
    final Path tempDir = droidOutputFile.toPath();
    // all processors
    List<ProvenanceProcessor> provenanceProcessors = pa.getProcessorsForWorkflowID(workflowID);
    LOGGER.debug("Following processor are found");
    for (ProvenanceProcessor processor : provenanceProcessors) {
        LOGGER.debug("\t '" + processor.getProcessorName() + "'");
    }
    HashMap<String, String> queryConstraints = new HashMap<String, String>();
    queryConstraints.put("V.workflowId", workflowID);
    for (PortBinding binding : pa.getPortBindings(queryConstraints)) {
        LOGGER.debug("Found binding for port= " + binding.getPortName());
        // Only interested in results from a specific workflow run
        if (!binding.getWorkflowRunId().equals(workflowRunID)) {
            continue;
        }
        T2Reference ref;
        try {
            ref = ic.getReferenceService().referenceFromString(binding.getValue());
        } catch (ReferenceServiceException rse) {
            LOGGER.debug("\t Error trying to fetch T2Reference (" + binding.getValue() + ") from t2-database ("
                    + t2Databasepath + "). No further treatment.");
            // NOTE(review): this stops the scan of ALL remaining bindings, not only of this one.
            // Kept unchanged; confirm whether skipping just this binding was intended.
            break; // no further treatment!
        }
        Identified identified = pc.getReferenceService().resolveIdentifier(ref, null, ic);
        if (!(identified instanceof ReferenceSet)) {
            continue;
        }
        LOGGER.info("try to fetch reference set for binding "+binding.getPortName());
        Set<ExternalReferenceSPI> externalReferences = ((ReferenceSet) identified).getExternalReferences();
        for (ExternalReferenceSPI externalReference : externalReferences) {
            if (externalReference instanceof ValueCarryingExternalReference<?>) {
                collectValue((ValueCarryingExternalReference<?>) externalReference, binding, tempDir.toFile());
            }
        }
    }
    return tempDir;
    // final String dir = tempDir.toString();
    // Thread t = new Thread() {
    // public void run() {
    // String[] args = {"-a", dir, "-p", droidOutputFilePath};
    // StringBuilder info = new StringBuilder();
    // info.append("Running DROID identification with following arguments: ");
    // for (String arg : args) {
    // info.append(" " + arg + ",");
    // }
    // LOGGER.debug(info);
    // try {
    // DroidCommandLine.main(args);
    // LOGGER.info("DROID profile successfully saved at "+droidOutputFilePath);
    // } catch (CommandLineException e) {
    // e.printStackTrace();
    // }
    // }
    // };
    // t.start();
}

/**
 * Copies the payload of one value-carrying reference into {@code targetDir} when it is a string naming
 * a non-empty local file, a string matching the URL pattern, or a byte array.
 */
private void collectValue(ValueCarryingExternalReference<?> vcer, PortBinding binding, File targetDir)
        throws IOException {
    // string ? filepath or URL
    if (String.class.isAssignableFrom(vcer.getValueType())) {
        String possibleFilePath = (String) vcer.getValue();
        String prefix = "[" + binding.getProcessorName() + "][" + binding.getPortName() + "]-";
        // valid file ?
        File file = isFile(possibleFilePath);
        if (file != null && file.length() > 0) {
            LOGGER.debug("\t Found file (" + file.getPath() + ") for port= "
                    + binding.getPortName());
            File tmpFile = File.createTempFile(prefix, ".file", targetDir);
            try {
                FileUtils.copyFile(file, tmpFile);
            } catch (IOException e) {
                // A failed copy of a local file is logged and skipped; it does not abort the collection.
                LOGGER.error("Error during fetching file. " + e.getMessage(), e);
            }
        }
        // valid URL ?
        URL url = isURL(possibleFilePath);
        if (url != null) {
            LOGGER.debug("\t Found valid URL (" + possibleFilePath + ") for port = "
                    + binding.getPortName());
            // download file and copy it to temp directory
            File content = File.createTempFile(prefix, ".url", targetDir);
            FileUtils.copyURLToFile(url, content);
            LOGGER.debug("\t Created temporary file (" + content.getAbsolutePath() + ")");
        }
    }
    // file as byte array ?
    if (byte[].class.isAssignableFrom(vcer.getValueType())) {
        LOGGER.debug("\t Found byte[] for port= " + binding.getPortName());
        String prefix = "[" + binding.getPortName() + "]-";
        byte[] content = (byte[]) vcer.getValue();
        File tmpFile = File.createTempFile(prefix, ".binary", targetDir);
        FileUtils.writeByteArrayToFile(tmpFile, content);
        LOGGER.debug("\t Create " + prefix + " file for port= " + binding.getPortName());
    }
}
/**
* Generates a CSV report out of a DROID profile file
*
* @param droidProfile
* @throws CommandLineException
*/
// public void generateReport(final File droidProfile) {
//
// final String dir = droidProfile.getParentFile().getAbsolutePath();
// final String fileName = droidProfile.getName();
//
// LOGGER.debug("Directory of the DROID profile file = " + dir);
//
// final File resultCSV = new File(dir + File.separator + fileName + ".csv");
//
// LOGGER.info("Generate CSV-report [" + resultCSV.getAbsolutePath() + "] out of profile ["
// + droidProfile.getAbsolutePath() + "].");
//
// Thread t = new Thread() {
// public void run() {
// String[] args = {"-p", droidProfile.getAbsolutePath(), "-e", resultCSV.getAbsolutePath()};
// StringBuilder info = new StringBuilder();
// info.append("Running DROID identification with following arguments: ");
// for (String arg : args) {
// info.append(" " + arg + ",");
// }
// LOGGER.debug(new String(info));
// try {
// DroidCommandLine.main(args);
// } catch (CommandLineException e) {
// e.printStackTrace();
// }
// }
// };
//
// t.start();
// }
/**
 * Reads a DROID generated CSV report and converts its rows to {@link CSVEntry} objects.
 * <p>
 * The file is read as UTF-8 with {@code ,} as separator and {@code '} as quote character. The reader is
 * created with a start line of 1, and the first parsed row is dropped in addition.
 * NOTE(review): this looks like the header is skipped twice, which would lose the first data row.
 * The behaviour is kept as is; confirm against a real DROID export.
 *
 * @param CSVReport report file to read
 * @return the parsed rows without the first one; an empty list when the file cannot be opened or no row
 *         was parsed
 * @throws IllegalArgumentException if {@code CSVReport} is {@code null}
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public List<CSVEntry> readCSV(File CSVReport) {
    if (CSVReport == null) {
        LOGGER.error("ERROR: CSVReport must not be null.");
        throw new IllegalArgumentException("ERROR: CSVReport must not be null.");
    }
    CSVReader reader;
    try {
        reader = new CSVReader(new InputStreamReader(new FileInputStream(CSVReport.getAbsolutePath()), "UTF-8"),
                ',', '\'', 1);
    } catch (UnsupportedEncodingException e) {
        LOGGER.error("ERROR: Unsupported encoding " + e.getMessage(), e);
        return Collections.<CSVEntry>emptyList();
    } catch (FileNotFoundException e) {
        LOGGER.error("ERROR: File not found " + e.getMessage(), e);
        return Collections.<CSVEntry>emptyList();
    }
    try {
        // Columns are mapped by position, in the order listed here.
        ColumnPositionMappingStrategy strat = new ColumnPositionMappingStrategy();
        strat.setType(CSVEntry.class);
        strat.setColumnMapping(new String[]{"ID", "PARENT_ID", "URI", "FILE_PATH", "NAME", "METHOD", "STATUS",
                "SIZE", "TYPE", "EXT", "LAST_MODIFIED", "EXTENSION_MISMATCH", "MD5_HASH", "FORMAT_COUNT", "PUID",
                "MIME_TYPE", "FORMAT_NAME", "FORMAT_VERSION"});
        CsvToBean csv = new CsvToBean();
        List<CSVEntry> list = csv.parse(strat, reader);
        if (list == null || list.isEmpty()) {
            // subList(1, 0) would throw IllegalArgumentException for an empty report.
            return Collections.<CSVEntry>emptyList();
        }
        // step over line1 in CSV
        return list.subList(1, list.size());
    } finally {
        try {
            reader.close();
        } catch (IOException e) {
            LOGGER.warn("Could not close CSV report " + CSVReport.getAbsolutePath(), e);
        }
    }
}
/**
 * Removes duplicates from a list of entries while keeping the order of first occurrence.
 * <p>
 * Two entries count as duplicates when {@code CSVEntry.equals} says so (presumably same name and PUID;
 * verify against {@code CSVEntry}).
 *
 * @param original entries to filter; the list itself is not modified
 * @return a new list without any double entries
 */
public List<CSVEntry> filterEntries(List<CSVEntry> original) {
    final List<CSVEntry> unique = new ArrayList<>();
    for (final CSVEntry candidate : original) {
        if (unique.contains(candidate)) {
            // already collected, skip the repetition
            continue;
        }
        unique.add(candidate);
    }
    return unique;
}
/**
 * Lists all workflow runs known to the provenance database, numbered with sequential internal IDs
 * starting at 0. These IDs are the ones {@code processWorkflow} expects.
 * <p>
 * Each call renumbers from scratch: entries of an earlier call are replaced, so no stale ID survives
 * when fewer runs are found than before.
 *
 * @return the internal map from internal ID to workflow run description
 * @throws IllegalStateException if the provenance access handle is not available after {@code setUp()}
 */
public Map<Integer, ProvenanceAccessEntry> listAllWorkflowsReadable() {
    setUp();
    if (pa == null) {
        // setUp() only logs its failures; report the cause instead of failing with a NullPointerException.
        throw new IllegalStateException("Provenance database at " + t2Databasepath + " is not available.");
    }
    // Build the new numbering aside, so the shared map stays untouched if the query fails half-way.
    Map<Integer, ProvenanceAccessEntry> fresh = new HashMap<>();
    int internalId = 0;
    for (WorkflowRun workflowRun : pa.getAllWorkflowIDs()) {
        ProvenanceAccessEntry entry = new ProvenanceAccessEntry(workflowRun.getWorkflowId(),
                workflowRun.getWorkflowRunId(), workflowRun.getWorkflowExternalName(), workflowRun.getTimestamp());
        fresh.put(internalId, entry);
        internalId++;
    }
    workflowMap.clear();
    workflowMap.putAll(fresh);
    return workflowMap;
}
/**
 * Detects whether a string names an existing regular file by asking the file system.
 * (An earlier regex-based heuristic was dropped in favour of this check.)
 *
 * @param possibleFilePath candidate path, may be {@code null}
 * @return the file if it exists and is a regular file; {@code null} for {@code null} input, for strings
 *         that are not valid paths, for directories and for paths that do not exist
 */
private File isFile(String possibleFilePath) {
    LOGGER.debug("call isFile(" + possibleFilePath + ")");
    if (possibleFilePath == null) {
        LOGGER.debug("\t File can not be detected. Path is null.");
        return null;
    }
    LOGGER.debug("\t Detect if file path (" + possibleFilePath + ") point to a file. ");
    try {
        File candidate = Paths.get(possibleFilePath).toFile();
        // Directories and missing paths are rejected here, so callers never try to copy them.
        return candidate.isFile() ? candidate : null;
    } catch (java.nio.file.InvalidPathException e) {
        // The string cannot be converted to a path at all (e.g. illegal characters).
        LOGGER.debug("\t File can not be detected. " + e.getMessage());
        return null;
    }
}
/**
 * Detects whether a string is an http, https, ftp or file URL.
 *
 * @param possibleURL candidate string, may be {@code null}
 * @return the parsed URL, or {@code null} when the string is {@code null}, not longer than 5 characters,
 *         does not match the URL pattern as a whole, or cannot be parsed
 */
private URL isURL(String possibleURL) {
    LOGGER.debug("call isURL(" + possibleURL + ")");
    final String urlRegex = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";
    final boolean looksLikeUrl = possibleURL != null
            && possibleURL.length() > 5
            && possibleURL.matches(urlRegex);
    if (!looksLikeUrl) {
        return null;
    }
    LOGGER.debug("\t Detect if (" + possibleURL + ") is a valid URL.");
    URL parsed = null;
    try {
        parsed = new URL(possibleURL);
    } catch (MalformedURLException e) {
        // Pattern matched but the JDK still rejects it; report and fall through to null.
        LOGGER.debug("\t Not a valid URL. "+e.getMessage());
    }
    return parsed;
}
}