//--------------------------------------------------------------------
// extra file-processing code
//--------------------------------------------------------------------
//
// Origin: unified diff
//     --- searchmonkey-0.8.1.old/src/search.c
//     +++ searchmonkey-0.8.1/src/search.c
//
// Hunk 1 (@@ -434,6 +434,203 @@) inserts this whole section between
// the context lines
//     g_thread_exit (GINT_TO_POINTER(0));
//     }
// and the comment that begins
//     "POSIX threaded: phase one of main search loop."
//
// Hunk 2 (@@ -571,6 +768,13 @@) adds the only call site, placed after
// the context lines "continue; } }" and immediately before the comment
// "Start working with the new folder name" and its test
// "if (g_file_test(tmpFullFileName, G_FILE_TEST_IS_DIR)) {":
//
//     if (AltSkipFile (tmpFullFileName))
//     {
//         g_free (tmpFullFileName);
//         g_free (tmpFileName);
//         continue;
//     }
//--------------------------------------------------------------------

// Note: the header name on the original "#include" line was missing.
// The headers below are the ones this section actually relies on.

#include <fcntl.h>          // open, O_RDONLY
#include <string.h>         // strncmp, strchr, strrchr
#include <strings.h>        // strcasecmp
#include <sys/types.h>      // ssize_t
#include <sys/stat.h>       // stat, struct stat, S_ISREG
#include <unistd.h>         // read, close

#ifndef ZERO
#define ZERO 0
#endif

#ifndef ONE
#define ONE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif

#ifndef TRUE
#define TRUE 1
#endif

// Number of leading bytes sampled by "AltSkipByContents".
#define ALT_CHECKCON_SIZE_BLOCK   512

// Files smaller than this many bytes are never sampled.
#define ALT_CHECKCON_SIZE_TRIGGER 250000L

//--------------------------------------------------------------------

// "AltSkipByPath" is a support routine for "AltSkipFile". It makes a
// decision from the pathname alone; the file is never opened.
//
// "path" must be a non-NULL, NUL-terminated pathname. The routine
// returns one of three values:
//
//     TRUE  -- "AltSkipFile" should reject the object. This happens
//              when "path" starts with "/proc/" or "/sys/", or when
//              its filename extension (compared case-insensitively)
//              is listed in "SkipExt".
//
//     FALSE -- No reason to reject was found here. This includes
//              paths with no extension, an empty extension, or whose
//              last dot belongs to a directory component.
//
//     -1    -- "AltSkipFile" should accept the object without running
//              any further checks. Currently only for ".pdf".

static int AltSkipByPath (const char *path)
{
    // Extensions (text after the final dot) that are rejected outright.
    // The list is NULL-terminated.
    static const char *const SkipExt[] =
    {
        "7z"   , "arj"  , "asf"  , "avi"  , "bmp"  , "bz2"  , "cab"  ,
        "class", "cpio" , "dll"  , "exe"  , "eps"  , "flac" , "flv"  ,
        "gz"   , "gif"  , "hqx"  , "ico"  , "iso"  , "jar"  , "jpe"  ,
        "jpeg" , "jpg"  , "lha"  , "m4v"  , "mov"  , "mp2"  , "mp3"  ,
        "mp4"  , "mpe"  , "mpeg" , "mpg"  , "o"    , "ogg"  , "ogm"  ,
        "pcx"  , "png"  , "pnm"  , "ppm"  , "ppt"  , "qts"  , "qtx"  ,
        "rar"  , "rm"   , "rpm"  , "rv"   , "swf"  , "tar"  , "taz"  ,
        "tbz"  , "tga"  , "tgz"  , "tif"  , "tiff" , "ttf"  , "vxd"  ,
        "wav"  , "wma"  , "wmv"  , "xpi"  , "z"    , "zip"  ,
        NULL
    };

    const char *ext;            // Filename extension [after the dot]
    int ii;                     // Zero-based index into "SkipExt"

    // Safety measure: skip two directory trees that it's not safe to
    // traverse.
    if (strncmp (path, "/proc/", 6) == 0) return TRUE;
    if (strncmp (path, "/sys/" , 5) == 0) return TRUE;

    // Locate the extension: the text after the last dot in the path.
    ext = strrchr (path, '.');
    if (ext == NULL) return FALSE;
    ext++;

    // A slash after the last dot means the dot is part of a directory
    // name, so the final component has no extension.
    if (strchr (ext, '/') != NULL) return FALSE;

    // A trailing dot gives an empty extension.
    if (*ext == '\0') return FALSE;

    // PDF files bypass every remaining check.
    if (strcasecmp (ext, "pdf") == 0) return -1;

    for (ii = 0; SkipExt [ii] != NULL; ii++)
    {
        if (strcasecmp (ext, SkipExt [ii]) == 0) return TRUE;
    }

    return FALSE;
}

//--------------------------------------------------------------------

// "AltSkipByContents" is a support routine for "AltSkipFile". It
// decides from file metadata and from the first
// ALT_CHECKCON_SIZE_BLOCK bytes of the file.
//
// "path" must be a non-NULL, NUL-terminated pathname. The routine
// returns:
//
//     TRUE  -- "AltSkipFile" should reject the object: "stat", "open"
//              or "read" failed (or "read" returned fewer bytes than
//              requested), or more than 4% of the sampled bytes are
//              control characters.
//
//     FALSE -- No reason to reject was found: the object is not a
//              regular file, is smaller than
//              ALT_CHECKCON_SIZE_TRIGGER bytes, or its sample looks
//              like text.
//
// The -1 ("accept without further checks") value understood by
// "AltSkipFile" is never produced by this routine.
//
// Technical note: This routine uses heuristics that may cause some
// non-ASCII text files to be rejected.

static int AltSkipByContents (const char *path)
{
    // Small data buffer
    unsigned char data [ALT_CHECKCON_SIZE_BLOCK];

    struct stat sbuf;           // File-status information buffer
    long long FileSize;         // File size [in bytes]
    ssize_t got;                // Result of "read"
    int ifd;                    // "read"-level input descriptor
    int ii;                     // Zero-based index
    int nb;                     // Number of bytes to sample
    int nctl;                   // Control characters seen in sample

    if (stat (path, &sbuf) < 0) return TRUE;
    if (!S_ISREG (sbuf.st_mode)) return FALSE;

    FileSize = (long long) sbuf.st_size;
    if (FileSize < ALT_CHECKCON_SIZE_TRIGGER) return FALSE;

    // Sample at most one block, and never more than the file holds.
    nb = ALT_CHECKCON_SIZE_BLOCK;
    if ((long long) nb > FileSize) nb = (int) FileSize;
    if (nb <= 0) return FALSE;  // Nothing to sample

    ifd = open (path, O_RDONLY);
    if (ifd < 0) return TRUE;

    // Close the descriptor before looking at the result, so that a
    // failed or short read cannot leak it.
    got = read (ifd, data, (size_t) nb);
    close (ifd);
    if (got != (ssize_t) nb) return TRUE;

    // Count bytes below the space character, except TAB, LF, FF and
    // CR, which are normal in text files.
    nctl = 0;
    for (ii = 0; ii < nb; ii++)
    {
        unsigned char c = data [ii];

        if (c >= ' ') continue;
        if (c == '\t' || c == '\n' || c == '\f' || c == '\r') continue;
        nctl++;
    }

    // Integer percentage of control characters in the sample; more
    // than 4 means the file is treated as binary.
    if ((nctl * 100) / nb > 4) return TRUE;

    return FALSE;
}

//--------------------------------------------------------------------

// "path" should be an absolute pathname. "AltSkipFile (path)" runs
// "AltSkipByPath" and then, if that check is inconclusive,
// "AltSkipByContents". It returns one of two values:
//
//     TRUE  -- The caller should not attempt to process the specified
//              object. This is also returned for a NULL "path".
//
//     FALSE -- No reason to reject the specified object was found, or
//              a helper returned -1 (accept without further checks).

static int AltSkipFile (const char *path)
{
    int n;                      // Helper verdict: >0 reject, <0 accept

    // A NULL path cannot be examined; reject it instead of crashing.
    if (path == NULL) return TRUE;

    n = AltSkipByPath (path);
    if (n > 0) return TRUE;
    if (n < 0) return FALSE;

    n = AltSkipByContents (path);
    if (n > 0) return TRUE;
    if (n < 0) return FALSE;

    // Note: Additional checks could be added here. However, this is
    // optional.

    return FALSE;
}

//--------------------------------------------------------------------