From 8d560d6cf854fea81e0e0a2b8a22c406de4febb3 Mon Sep 17 00:00:00 2001 From: Filipp Lepalaan Date: Thu, 19 Oct 2017 11:20:55 +0300 Subject: Initial commit --- .gitignore | 2 ++ README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++ dirstat.py | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ schema.sql | 10 +++++++ 4 files changed, 168 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 dirstat.py create mode 100644 schema.sql diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..734b16e --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +dirstat.sqlite3 diff --git a/README.md b/README.md new file mode 100644 index 0000000..4eac4ad --- /dev/null +++ b/README.md @@ -0,0 +1,65 @@ +### Introduction + +`dirstat` is a fairly crude, but surprisingly useful tool to generate different statistics about directory structures. + + +### Usage + +Let's say you have a network share mounted at /Volumes/SHARE. Let's import it into dirstat: + + $ dirstat.py import /Volumes/SHARE + + +After the tool is finished, we can use the sqlite3 command line to get some insight into the tree, without disturbing the +file system itself. Here are just a few examples: + +#### Top 10 file types by count + + sqlite> select type, count(*) as c from paths where import_id = 1 group by type order by c desc limit 10; + JPG|172592 + PDF|124481 + PNG|74889 + DWG|51314 + GSM|40811 + MXS|14726 + DOC|6479 + PSD|5640 + MXI|4745 + PLT|4684 + +#### Space usage in gigabytes by year (of modification date): + + sqlite> select strftime('%Y', mtime) as y, sum(size)/1024/1024/1024 as s from paths group by y order by s desc; + 2017|1300 + 2016|1256 + 2014|994 + 2015|974 + 2013|584 + 2012|260 + +#### File types and their counts in a given subdirectory: + + sqlite> select type, count(*) from paths where path like '%/PROJECTS/ACTIVE/%' group by type; + 3DM|3 + 3DMBAK|3 + AI|1 + BMP|1 + DIR|51 + DOC|65 + DOCX|7 + DOTX|1 + DWF|1 + DWG|8 + + +### System Requirements + +- Python 3 (tested with 3.6.3) + + +### License + + Copyright © 2017 Filipp Lepalaan + This work is free. You can redistribute it and/or modify it under the + terms of the Do What The Fuck You Want To Public License, Version 2, + as published by Sam Hocevar. See http://www.wtfpl.net/ for more details. diff --git a/dirstat.py b/dirstat.py new file mode 100755 index 0000000..e403334 --- /dev/null +++ b/dirstat.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python + +import re +import os +import sys +import time +import sqlite3 +from os.path import join, getsize, basename + +DBPATH = 'dirstat.sqlite3' + +if not os.path.exists(DBPATH): + os.system('sqlite3 ' + DBPATH + ' < schema.sql') + +DB = sqlite3.connect(DBPATH) +SKIPNAMES = re.compile(r'^\.') +EXTENSION = re.compile(r'.+\.(\w+)$') + +def stats(sql): + for r in DB.execute(sql): + print(r) + + +def scandir(path, overwrite=True): + if not os.path.exists(path): + raise Exception('Invalid path: %s' % path) + + if not overwrite: + DB.execute('SELECT id FROM imports WHERE path = ?', (path,)) + if DB.fetchone(): + raise Exception('Path %s already imported' % path) + + DB.execute('INSERT INTO imports(path) VALUES (?)', (path,)) + cursor = DB.execute('SELECT last_insert_rowid() FROM imports') + import_id = cursor.fetchone()[0] + + total_size = 0 + total_count = 0 + sql = 'INSERT INTO paths(path, type, mtime, size, import_id) VALUES (?,?,?,?, ?)' + + start_time = time.time() + + for root, subdirs, files in os.walk(path): + for f in files: + t = 'DIR' + fp = unicode(join(root, f), 'UTF-8') + if SKIPNAMES.match(f): + continue + + result = EXTENSION.match(f) + + if result and result.group(0): + t = result.group(1).upper() + + s = os.lstat(fp) + + print 'Importing %s' % fp + DB.execute(sql, (fp, t, s.st_mtime, s.st_size, import_id,)) + total_count += 1 + total_size += s.st_size + + # Convert to timestamps + DB.execute("UPDATE paths SET mtime = datetime(mtime, 'unixepoch', 'localtime')") + DB.execute("UPDATE imports SET endtime = datetime('now'), count = ?, size = ? WHERE id = ?", + (import_id, total_count, total_size,)) + DB.commit() + + print '{0} items imported from {2} ({1} bytes)'.format(total_count, total_size, path) + + +if __name__ == '__main__': + if sys.argv[1] == 'import': + path = sys.argv[2] + scandir(path) + elif sys.argv[1] == 'stats': + query = 'SELECT type, size FROM paths GROUP BY type' + if len(sys.argv) == 3: + query = sys.argv[2] + stats(query) + elif sys.argv[1] == 'clear': + DB.execute('DELETE FROM paths') + DB.commit() + elif sys.argv[1] == 'shell': + while True: + cmd = raw_input() + try: + for r in DB.execute(cmd): + s = [] + print(u', '.join(r)) + except sqlite3.OperationalError as e: + print e diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..eb74429 --- /dev/null +++ b/schema.sql @@ -0,0 +1,10 @@ +CREATE TABLE imports (id INTEGER PRIMARY KEY AUTOINCREMENT, + path TEXT, + starttime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + endtime TIMESTAMP, + count INTEGER, + size INTEGER); +CREATE TABLE paths (import_id INTEGER, path TEXT, type TEXT, mtime TIMESTAMP, size INTEGER); +CREATE index idx1 ON paths(path); +CREATE index idx2 ON paths(type); +CREATE index idx3 ON imports(path); -- cgit v1.2.3