aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--README.md65
-rwxr-xr-xdirstat.py91
-rw-r--r--schema.sql10
4 files changed, 168 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..734b16e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.env
+dirstat.sqlite3
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4eac4ad
--- /dev/null
+++ b/README.md
@@ -0,0 +1,65 @@
+### Introduction
+
+`dirstat` is a fairly crude, but surprisingly useful tool to generate different statistics about directory structures.
+
+
+### Usage
+
+Let's say you have a network share mounted at /Volumes/SHARE. Let's import it into dirstat:
+
+ $ dirstat.py import /Volumes/SHARE
+
+
+After the tool is finished, we can use the sqlite3 command line to get some insight into the tree, without disturbing the
+file system itself. Here are just a few examples:
+
+#### Top 10 file types by count
+
+ sqlite> select type, count(*) as c from paths where import_id = 1 group by type order by c desc limit 10;
+ JPG|172592
+ PDF|124481
+ PNG|74889
+ DWG|51314
+ GSM|40811
+ MXS|14726
+ DOC|6479
+ PSD|5640
+ MXI|4745
+ PLT|4684
+
+#### Space usage in gigabytes by year (of modification date):
+
+ sqlite> select strftime('%Y', mtime) as y, sum(size)/1024/1024/1024 as s from paths group by y order by s desc;
+ 2017|1300
+ 2016|1256
+ 2014|994
+ 2015|974
+ 2013|584
+ 2012|260
+
+#### File types and their counts in a given subdirectory:
+
+ sqlite> select type, count(*) from paths where path like '%/PROJECTS/ACTIVE/%' group by type;
+ 3DM|3
+ 3DMBAK|3
+ AI|1
+ BMP|1
+ DIR|51
+ DOC|65
+ DOCX|7
+ DOTX|1
+ DWF|1
+ DWG|8
+
+
+### System Requirements
+
+- Python 3 (tested with 3.6.3)
+
+
+### License
+
+ Copyright © 2017 Filipp Lepalaan <filipp@mac.com>
+ This work is free. You can redistribute it and/or modify it under the
+ terms of the Do What The Fuck You Want To Public License, Version 2,
+ as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
diff --git a/dirstat.py b/dirstat.py
new file mode 100755
index 0000000..e403334
--- /dev/null
+++ b/dirstat.py
@@ -0,0 +1,91 @@
+#! /usr/bin/env python
+
+import re
+import os
+import sys
+import time
+import sqlite3
+from os.path import join, getsize, basename
+
+DBPATH = 'dirstat.sqlite3'
+
+if not os.path.exists(DBPATH):
+ os.system('sqlite3 ' + DBPATH + ' < schema.sql')
+
+DB = sqlite3.connect(DBPATH)
+SKIPNAMES = re.compile(r'^\.')
+EXTENSION = re.compile(r'.+\.(\w+)$')
+
+def stats(sql):
+ for r in DB.execute(sql):
+ print(r)
+
+
+def scandir(path, overwrite=True):
+ if not os.path.exists(path):
+ raise Exception('Invalid path: %s' % path)
+
+ if not overwrite:
+ DB.execute('SELECT id FROM imports WHERE path = ?', (path,))
+ if DB.fetchone():
+ raise Exception('Path %s already imported' % path)
+
+ DB.execute('INSERT INTO imports(path) VALUES (?)', (path,))
+ cursor = DB.execute('SELECT last_insert_rowid() FROM imports')
+ import_id = cursor.fetchone()[0]
+
+ total_size = 0
+ total_count = 0
+ sql = 'INSERT INTO paths(path, type, mtime, size, import_id) VALUES (?,?,?,?, ?)'
+
+ start_time = time.time()
+
+ for root, subdirs, files in os.walk(path):
+ for f in files:
+ t = 'DIR'
+ fp = unicode(join(root, f), 'UTF-8')
+ if SKIPNAMES.match(f):
+ continue
+
+ result = EXTENSION.match(f)
+
+ if result and result.group(0):
+ t = result.group(1).upper()
+
+ s = os.lstat(fp)
+
+ print 'Importing %s' % fp
+ DB.execute(sql, (fp, t, s.st_mtime, s.st_size, import_id,))
+ total_count += 1
+ total_size += s.st_size
+
+ # Convert to timestamps
+ DB.execute("UPDATE paths SET mtime = datetime(mtime, 'unixepoch', 'localtime')")
+ DB.execute("UPDATE imports SET endtime = datetime('now'), count = ?, size = ? WHERE id = ?",
+ (import_id, total_count, total_size,))
+ DB.commit()
+
+ print '{0} items imported from {2} ({1} bytes)'.format(total_count, total_size, path)
+
+
+if __name__ == '__main__':
+ if sys.argv[1] == 'import':
+ path = sys.argv[2]
+ scandir(path)
+ elif sys.argv[1] == 'stats':
+ query = 'SELECT type, size FROM paths GROUP BY type'
+ if len(sys.argv) == 3:
+ query = sys.argv[2]
+ stats(query)
+ elif sys.argv[1] == 'clear':
+ DB.execute('DELETE FROM paths')
+ DB.commit()
+ elif sys.argv[1] == 'shell':
+ while True:
+ cmd = raw_input()
+ try:
+ for r in DB.execute(cmd):
+ s = []
+ print(u', '.join(r))
+ except sqlite3.OperationalError as e:
+ print e
diff --git a/schema.sql b/schema.sql
new file mode 100644
index 0000000..eb74429
--- /dev/null
+++ b/schema.sql
@@ -0,0 +1,10 @@
+CREATE TABLE imports (id INTEGER PRIMARY KEY AUTOINCREMENT,
+ path TEXT,
+ starttime TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ endtime TIMESTAMP,
+ count INTEGER,
+ size INTEGER);
+CREATE TABLE paths (import_id INTEGER, path TEXT, type TEXT, mtime TIMESTAMP, size INTEGER);
+CREATE index idx1 ON paths(path);
+CREATE index idx2 ON paths(type);
+CREATE index idx3 ON imports(path);