1 module symmetry.api.tika;
2 
/// Integration test: fetches a sample PDF (unless it is already on disk) and
/// runs type detection, metadata extraction and bulk conversion against a
/// default-constructed TikaServer. Requires network access for the download
/// and a server reachable at the TikaServer default url.
unittest
{
	import std.conv : text;
	import std.file : exists;
	import std.net.curl : download;
	import std.string : lastIndexOf;

	immutable sampleUrl = "https://github.com/Laeeth/docshare/raw/master/paretian.pdf";

	// Everything from the last '/' onwards is the file name; prefixing "."
	// turns it into a path relative to the current directory.
	immutable slashPos = sampleUrl.lastIndexOf("/");
	assert(slashPos >= 0);
	immutable localPath = "." ~ sampleUrl[slashPos .. $];

	// Only download when the file is not already present.
	if (!exists(localPath))
	{
		download(sampleUrl, localPath);
	}

	TikaServer server;

	auto detected = server.detectType(localPath);
	assert(detected.value == "application/pdf");

	auto metaData = server.extractMetaData(localPath);
	assert(metaData.success, "failed to extract metadata" ~ "\n" ~ metaData.value.text);

	// The conversion result is not inspected; the call only exercises the code path.
	auto converted = server.convertBulkTo([localPath]);

	assert(metaData.value["title"] == "THE BEST AND THE REST: REVISITING THE NORM OF NORMALITY OF INDIVIDUAL PERFORMANCE");
}
23 
/// Outcome of a request whose response body is kept as a single string.
struct TikaResult
{
	import requests : Response;

	/// HTTP status code taken from the response.
	int responseCode;
	/// True only when the status code is 200.
	bool success = false;
	/// Response body reinterpreted as text.
	string value;

	/// Builds the result from an HTTP response.
	/// The body bytes are cast to characters and copied; the cast itself
	/// performs no UTF-8 validation.
	private this(Response response)
	{
		responseCode = response.code;
		success = (responseCode == 200);

		auto bodyChars = cast(char[]) response.responseBody.data;
		value = bodyChars.idup;
	}
}
39 
/// Outcome of a metadata request: the response body is parsed as CSV into a
/// key -> value table.
struct TikaMetaData
{
	import requests : Response;

	/// HTTP status code taken from the response.
	int responseCode;
	/// True only when the status code is 200.
	bool success = false;
	/// Parsed metadata; the first CSV field of a record is the key and the
	/// second field is the value. Further fields of a record are ignored.
	string[string] value;

	/// Builds the metadata table from an HTTP response.
	///
	/// The body is parsed with std.csv so that quoted values containing
	/// commas (or doubled quotes) are kept intact. Records with fewer than
	/// two fields -- blank lines, or a body that is not CSV at all -- are
	/// skipped instead of triggering a range violation.
	/// NOTE(review): assumes the server answers with CSV when no Accept
	/// header is sent -- confirm against the server documentation.
	private this(Response response)
	{
		import std.array : array;
		import std.csv : csvReader, Malformed;
		import std.string : strip;

		success = (response.code == 200);
		responseCode = response.code;
		auto content = (cast(char[]) response.responseBody.data).idup;

		// Malformed.ignore keeps parsing lenient: stray quotes and rows with
		// differing field counts do not throw.
		foreach (record; csvReader!(string, Malformed.ignore)(content))
		{
			// Materialise the record so it is fully consumed before the
			// reader advances to the next one.
			auto fields = record.array;
			if (fields.length < 2)
				continue;
			value[fields[0].strip] = fields[1].strip;
		}
	}
}
63 
/// Small client for the HTTP endpoints of a Tika server.
/// Every operation issues a PUT request to `url ~ '/' ~ <endpoint path>`.
struct TikaServer
{
	/// Endpoint paths, relative to `url`.
	enum url_tika = "tika";
	enum url_meta = "meta";
	enum url_detect = "detect/stream";
	enum url_detectors = "detectors";
	enum url_mimetypes = "mime-types";

	/// Base address of the server, without a trailing slash.
	string url = "http://127.0.0.1:9998";
	/// Request timeout, in seconds.
	int timeoutSeconds = 60;

	/// Sends the contents of `filename` to the endpoint `urlPath`,
	/// reading the file in 1024-byte chunks.
	private auto doRequestFromFile(string urlPath, string filename,
			string[string] headers = [ "Accept":"text/plain" ])
	{
		import std.stdio : File;

		auto chunks = File(filename).byChunk(1024);
		return doRequestFromData(urlPath, chunks, headers);
	}

	/// Sends `input` as the body of a PUT request to the endpoint `urlPath`
	/// with the given headers and the configured timeout.
	private auto doRequestFromData(S)(string urlPath, S input,
			string[string] headers = [ "Accept":"text/plain" ])
	{
		import core.time : seconds;
		import requests : Request;

		auto request = Request();
		request.addHeaders(headers);
		request.timeout = seconds(timeoutSeconds);

		auto endpoint = url ~ '/' ~ urlPath;
		return request.exec!"PUT"(endpoint, input);
	}

	/// Requests the metadata of the file `filename` (endpoint `url_meta`).
	/// No headers are sent unless the caller supplies some.
	TikaMetaData extractMetaData(string filename,
			string[string] headers = (string[string]).init)
	{
		auto response = doRequestFromFile(url_meta, filename, headers);
		return TikaMetaData(response);
	}

	/// Converts every file in `filenames`, one request per file, and returns
	/// the results in the same order.
	TikaResult[] convertBulkTo(string[] filenames,
			string[string] headers = [ "Accept":"text/plain" ])
	{
		TikaResult[] results;
		results.reserve(filenames.length);
		foreach (filename; filenames)
		{
			results ~= convertTo(filename, headers);
		}
		return results;
	}

	/// Asks the server to detect the type of the file `filename`
	/// (endpoint `url_detect`). No headers are sent unless supplied.
	TikaResult detectType(string filename,
			string[string] headers = (string[string]).init)
	{
		auto response = doRequestFromFile(url_detect, filename, headers);
		return TikaResult(response);
	}

	/// Sends the file `filename` to the conversion endpoint (`url_tika`).
	/// By default the request carries `Accept: text/plain`.
	TikaResult convertTo(string filename,
			string[string] headers = [ "Accept":"text/plain" ])
	{
		auto response = doRequestFromFile(url_tika, filename, headers);
		return TikaResult(response);
	}

	/// Sends `inputString` itself (not a file) to the conversion endpoint
	/// (`url_tika`). By default the request carries `Accept: text/plain`.
	TikaResult convertStringTo(string inputString,
			string[string] headers = [ "Accept":"text/plain" ])
	{
		auto response = doRequestFromData(url_tika, inputString, headers);
		return TikaResult(response);
	}
}
133 
/// Strips surrounding whitespace and then removes one leading and one
/// trailing double quote, each only if present.
///
/// The two quotes are handled independently, so a value that has lost one of
/// its quotes (for example `"abc`) is still cleaned up. An empty quoted
/// string (`""`) yields the empty string.
///
/// Params:
///   s = text to clean up
/// Returns: the trimmed text without its enclosing quotes
private string unQuote(string s)
{
	import std.string : strip;

	s = s.strip;
	// Leading quote: any non-empty string qualifies, including a bare `"`.
	if (s.length > 0 && s[0] == '"')
		s = s[1 .. $];
	// Re-check the length: the slice above may have emptied the string.
	if (s.length > 0 && s[$ - 1] == '"')
		s = s[0 .. $ - 1];
	return s;
}
145