/**
 * Thin client for a Tika server's HTTP interface.
 *
 * Files or strings are sent with HTTP PUT to one of the server's endpoints
 * (see the `url_*` constants on `TikaServer`) and the response is wrapped in
 * either a `TikaResult` (raw body text) or a `TikaMetaData` (key/value map).
 */
module symmetry.api.tika;

// Integration test: downloads a sample PDF (unless it is already present in
// the current directory) and talks to a Tika server at TikaServer's default
// URL, so it needs both network access and a running server.
unittest
{
    import std.conv : text;
    import std.net.curl : download;
    import std.string : lastIndexOf;
    import std.file : exists;

    auto testFileUrl = "https://github.com/Laeeth/docshare/raw/master/paretian.pdf";
    auto filenameIndex = testFileUrl.lastIndexOf("/");
    assert(filenameIndex > -1);
    // The slice keeps the leading '/', so this yields "./paretian.pdf".
    auto filename = "." ~ testFileUrl[filenameIndex .. $];
    if (!filename.exists)
        download(testFileUrl, filename);

    TikaServer tikaServer;
    assert(tikaServer.detectType(filename).value == "application/pdf");

    auto meta = tikaServer.extractMetaData(filename);
    assert(meta.success, "failed to extract metadata" ~ "\n" ~ meta.value.text);

    auto res = tikaServer.convertBulkTo([filename]);
    assert(res.length == 1 && res[0].success, "failed to convert " ~ filename);

    assert(meta.value["title"] == "THE BEST AND THE REST: REVISITING THE NORM OF NORMALITY OF INDIVIDUAL PERFORMANCE");
}

/// Outcome of a request whose response body is kept as plain text.
struct TikaResult
{
    import requests : Response;

    /// HTTP status code of the response.
    int responseCode;
    /// True when the status code was 200.
    bool success = false;
    /// Response body reinterpreted as text (no UTF validation is performed).
    string value;

    /// Builds the result from a completed HTTP response.
    private this(Response response)
    {
        success = (response.code == 200);
        responseCode = response.code;
        value = (cast(char[]) response.responseBody.data).idup;
    }
}

/// Outcome of a metadata request, with the body parsed into key/value pairs.
struct TikaMetaData
{
    import requests : Response;

    /// HTTP status code of the response.
    int responseCode;
    /// True when the status code was 200.
    bool success = false;
    /// Metadata entries: first field of each body line => second field.
    string[string] value;

    /**
     * Builds the result from a completed HTTP response.
     *
     * Every line of the body is treated as a comma-separated record. The
     * first field becomes the key and the second the value; any further
     * fields are ignored. Lines with fewer than two fields (blank lines,
     * non-CSV error text) are skipped instead of triggering a range error.
     */
    private this(Response response)
    {
        import std.string : splitLines;

        success = (response.code == 200);
        responseCode = response.code;
        auto lines = (cast(char[]) response.responseBody.data).idup.splitLines;

        foreach (line; lines)
        {
            // Quote-aware split, so a comma inside a quoted value does not
            // truncate that value.
            auto cols = splitCsvFields(line);
            if (cols.length < 2)
                continue;
            value[cols[0].unQuote] = cols[1].unQuote;
        }
    }
}

/// Connection settings and request helpers for one Tika server.
struct TikaServer
{
    /// Endpoint paths, appended to `url`.
    enum url_tika = "tika";
    enum url_meta = "meta"; /// ditto
    enum url_detect = "detect/stream"; /// ditto
    enum url_detectors = "detectors"; /// ditto
    enum url_mimetypes = "mime-types"; /// ditto

    /// Base URL of the server.
    string url = "http://127.0.0.1:9998";
    /// Request timeout, in seconds.
    int timeoutSeconds = 60;

    /// PUTs the contents of `filename`, read in 1024-byte chunks, to `urlPath`.
    private auto doRequestFromFile(string urlPath, string filename,
        string[string] headers = [ "Accept":"text/plain" ])
    {
        import std.stdio : File;

        return doRequestFromData(urlPath, filename.File.byChunk(1024), headers);
    }

    /**
     * PUTs `input` to `url ~ "/" ~ urlPath` with the given request headers
     * and returns the response.
     *
     * One trailing '/' on `url` is dropped first, so a base URL configured
     * as "http://host:9998/" does not produce a double slash.
     */
    private auto doRequestFromData(S)(string urlPath, S input,
        string[string] headers = [ "Accept":"text/plain" ])
    {
        import requests : Request;
        import core.time : seconds;
        import std.string : chomp;

        auto rq = Request();
        rq.addHeaders(headers);

        rq.timeout = timeoutSeconds.seconds;
        return rq.exec!"PUT"(url.chomp("/") ~ '/' ~ urlPath, input);
    }

    /// Sends `filename` to the `meta` endpoint and parses the reply.
    /// No extra request headers are sent by default.
    TikaMetaData extractMetaData(string filename,
        string[string] headers = (string[string]).init)
    {
        return doRequestFromFile(url_meta, filename, headers)
            .TikaMetaData;
    }

    /// Runs `convertTo` on every file, returning one result per input in order.
    TikaResult[] convertBulkTo(string[] filenames,
        string[string] headers = [ "Accept":"text/plain" ])
    {
        import std.algorithm : map;
        import std.array : array;

        return filenames.map!(filename => convertTo(filename, headers)).array;
    }

    /// Sends `filename` to the `detect/stream` endpoint.
    /// No extra request headers are sent by default.
    TikaResult detectType(string filename,
        string[string] headers = (string[string]).init)
    {
        return doRequestFromFile(url_detect, filename, headers)
            .TikaResult;
    }

    /// Sends `filename` to the `tika` endpoint; `headers` defaults to
    /// requesting `text/plain`.
    TikaResult convertTo(string filename,
        string[string] headers = [ "Accept":"text/plain" ])
    {
        return doRequestFromFile(url_tika, filename, headers)
            .TikaResult;
    }

    /// Sends `inputString` itself (not a file) to the `tika` endpoint.
    TikaResult convertStringTo(string inputString,
        string[string] headers = [ "Accept":"text/plain" ])
    {
        return doRequestFromData(url_tika, inputString, headers)
            .TikaResult;
    }
}

/**
 * Splits one record into fields on commas that lie outside double quotes.
 *
 * Fields are returned as raw slices of `line`, quotes included. Each '"'
 * toggles the quoted state, so a doubled quote inside a quoted field leaves
 * the state unchanged. An empty line yields a single empty field.
 */
private string[] splitCsvFields(string line)
{
    string[] fields;
    size_t start = 0;
    bool inQuotes = false;

    foreach (i, char c; line)
    {
        if (c == '"')
        {
            inQuotes = !inQuotes;
        }
        else if (c == ',' && !inQuotes)
        {
            fields ~= line[start .. i];
            start = i + 1;
        }
    }
    fields ~= line[start .. $];
    return fields;
}

/**
 * Strips surrounding whitespace, then one leading and one trailing double
 * quote if present.
 *
 * The guards only require a non-empty string, so an empty quoted value
 * (two quote characters) becomes the empty string rather than a lone quote.
 */
private string unQuote(string s)
{
    import std.string : strip;

    s = s.strip;
    if (s.length > 0 && s[0] == '"')
        s = s[1 .. $];
    if (s.length > 0 && s[$ - 1] == '"')
        s = s[0 .. $ - 1];
    return s;
}