br-crawl.cpp
5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright 2014 Noblis *
* *
* Licensed under the Apache License, Version 2.0 (the "License"); *
* you may not use this file except in compliance with the License. *
* You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#include <QtCore>
/*!
 * Prints the br-crawl usage text (Markdown-formatted) to stdout.
 *
 * Takes no arguments and returns nothing; the caller decides whether to exit
 * afterwards.
 */
static void help()
{
    printf("br-crawl [URL] [args]\n"
           "=====================\n"
           "* __stdin__ - URLs\n"
           "* __stdout__ - Image URLs/JSON\n"
           "\n"
           "_br-crawl_ conducts a recursive descent search for images from a root URL.\n"
           "Crawl will read root URLs from _stdin_ if none are provided.\n"
           "Crawl writes every discovered image URL in a new line to _stdout_.\n"
           "Arguments specifying the duration of crawl are on a per-root-URL basis.\n"
           "\n"
           "Crawl identifies image URLs based on known image file extensions like `.png`.\n"
           "Crawl is not expected to verify that URLs are images and may produce false positives.\n"
           "\n"
           "Optional Arguments\n"
           "------------------\n"
           "* -auto - Crawl chooses its own root URL (must be specified otherwise).\n"
           "* -depth <int> - The levels to recursively search (unlimited otherwise).\n"
           "* -depthFirst - Depth-first search (breadth-first otherwise).\n"
           "* -help - Print usage information.\n"
           "* -images <int> - The number of image URLs to obtain (unlimited otherwise).\n"
           "* -json - Output JSON instead of URLs.\n"
           "* -time <int> - The seconds to spend searching for images (unlimited otherwise).\n");
}
static const char *root = NULL;
static bool autoRoot = false;
static int depth = INT_MAX;
static bool depthFirst = false;
static int images = INT_MAX;
static bool json = false;
static int timeLimit = INT_MAX;
static QTime elapsed;
static int currentImages = 0;
static void crawl(QFileInfo url, int currentDepth = 0)
{
if ((currentImages >= images) || (currentDepth >= depth) || (elapsed.elapsed()/1000 >= timeLimit))
return;
if (url.filePath().startsWith("file://"))
url = QFileInfo(url.filePath().mid(7));
if (url.isDir()) {
const QDir dir(url.absoluteFilePath());
const QFileInfoList files = dir.entryInfoList(QDir::Files);
const QFileInfoList subdirs = dir.entryInfoList(QDir::Dirs | QDir::NoDotAndDotDot);
foreach (const QFileInfo &first, depthFirst ? subdirs : files)
crawl(first, currentDepth + 1);
foreach (const QFileInfo &second, depthFirst ? files : subdirs)
crawl(second, currentDepth + 1);
} else if (url.isFile()) {
const QString suffix = url.suffix();
if ((suffix == "bmp") || (suffix == "jpg") || (suffix == "jpeg") || (suffix == "png") || (suffix == "tiff")) {
printf(json ? "{ \"URL\" : \"file://%s\" }\n" : "file://%s\n", qPrintable(url.canonicalFilePath()));
fflush(stdout);
currentImages++;
}
}
}
int main(int argc, char *argv[])
{
for (int i=1; i<argc; i++) {
if (!strcmp(argv[i], "-auto" )) autoRoot = true;
else if (!strcmp(argv[i], "-depth" )) depth = atoi(argv[++i]);
else if (!strcmp(argv[i], "-depthFirst")) depthFirst = true;
else if (!strcmp(argv[i], "-help" )) { help(); exit(EXIT_SUCCESS); }
else if (!strcmp(argv[i], "-images" )) images = atoi(argv[++i]);
else if (!strcmp(argv[i], "-json" )) json = true;
else if (!strcmp(argv[i], "-time" )) timeLimit = atoi(argv[++i]);
else root = argv[i];
}
elapsed.start();
if (root != NULL) {
crawl(QFileInfo(root));
} else {
if (autoRoot) {
foreach (const QString &path, QStandardPaths::standardLocations(QStandardPaths::HomeLocation))
crawl(path);
} else {
QFile file;
file.open(stdin, QFile::ReadOnly);
while (!file.atEnd()) {
const QString url = QString::fromLocal8Bit(file.readLine()).simplified();
if (!url.isEmpty())
crawl(url);
}
}
}
return EXIT_SUCCESS;
}