/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Copyright 2014 Noblis * * * * Licensed under the Apache License, Version 2.0 (the "License"); * * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * * limitations under the License. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include static void help() { printf("br-crawl [URL] [args]\n" "=====================\n" "* __stdin__ - URLs\n" "* __stdout__ - Image URLs/JSON\n" "\n" "_br-crawl_ conducts a recursive descent search for images from a root URL.\n" "Crawl will read root URLs from _stdin_ if none are provided.\n" "Crawl writes every discovered image URL in a new line to _stdout_.\n" "Arguments specifiying the duration of crawl are on a per-root-URL basis.\n" "\n" "Crawl identifies image URLs based on known image file extensions like `.png`.\n" "Crawl is not expected to verify that URLs are images and may produce false positives.\n" "\n" "Optional Arguments\n" "------------------\n" "* -auto - Crawl chooses its own root URL (must be specified otherwise).\n" "* -depth - The levels to recursively search (unlimited otherwise).\n" "* -depthFirst - Depth-first search (breadth-first otherwise).\n" "* -help - Print usage information.\n" "* -images - The number of image URLs to obtain (unlimited otherwise).\n" "* -json - Output JSON instead or URLs.\n" "* -time - The seconds to spend searching for images (unlimited otherwise).\n"); } static const char *root = NULL; static bool autoRoot = false; static int depth = INT_MAX; static bool depthFirst = false; static int images = INT_MAX; static bool json = false; static int timeLimit = INT_MAX; static QTime elapsed; static int currentImages = 0; static void crawl(QFileInfo url, int currentDepth = 0) { if ((currentImages >= images) || (currentDepth >= depth) || (elapsed.elapsed()/1000 >= timeLimit)) return; if (url.filePath().startsWith("file://")) url = QFileInfo(url.filePath().mid(7)); if (url.isDir()) { const QDir dir(url.absoluteFilePath()); const QFileInfoList files = dir.entryInfoList(QDir::Files); const QFileInfoList subdirs = dir.entryInfoList(QDir::Dirs | QDir::NoDotAndDotDot); foreach (const QFileInfo &first, depthFirst ? subdirs : files) crawl(first, currentDepth + 1); foreach (const QFileInfo &second, depthFirst ? files : subdirs) crawl(second, currentDepth + 1); } else if (url.isFile()) { const QString suffix = url.suffix(); if ((suffix == "bmp") || (suffix == "jpg") || (suffix == "jpeg") || (suffix == "png") || (suffix == "tiff")) { printf(json ? "{ \"URL\" : \"file://%s\" }\n" : "file://%s\n", qPrintable(url.canonicalFilePath())); fflush(stdout); currentImages++; } } } int main(int argc, char *argv[]) { for (int i=1; i