First commit of MFT carver and MFT parsers

commit 1973dc7031
tobias
2025-02-16 18:57:07 +01:00

2 changed files with 1006 additions and 0 deletions

mft.go Executable file

@@ -0,0 +1,496 @@
package main
import (
"bufio"
"bytes"
"encoding/base64"
"encoding/binary"
"encoding/json"
"flag"
"fmt"
"io"
"log"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"unicode/utf16"
)
// --------------------
// Data Structures
// --------------------
type FileNameAttribute struct {
Filename string `json:"filename"`
ParentRef uint64 `json:"parent_ref"`
Crtime string `json:"crtime"`
Mtime string `json:"mtime"`
Ctime string `json:"ctime"`
Atime string `json:"atime"`
}
type DataStream struct {
Name string `json:"name"`
Resident bool `json:"resident"`
NonResident bool `json:"non_resident"`
ContentBase64 string `json:"content_base64,omitempty"`
}
type MFTRecord struct {
// Header fields (omitting the "magic" since it's always "FILE")
UpdateSeqOffset uint16 `json:"update_seq_offset"`
UpdateSeqSize uint16 `json:"update_seq_size"`
LSN uint64 `json:"lsn"`
SequenceNumber uint16 `json:"sequence_number"`
HardLinkCount uint16 `json:"hard_link_count"`
FirstAttrOffset uint16 `json:"first_attr_offset"`
Flags uint16 `json:"flags"`
RealSize uint32 `json:"real_size"`
AllocatedSize uint32 `json:"allocated_size"`
BaseFileRecord uint64 `json:"base_file_record"`
NextAttrId uint16 `json:"next_attr_id"`
RecordNumber uint32 `json:"record_number"`
// Standard Information attribute timestamps (if present)
SI_Crtime string `json:"si_crtime,omitempty"`
SI_Mtime string `json:"si_mtime,omitempty"`
SI_Ctime string `json:"si_ctime,omitempty"`
SI_Atime string `json:"si_atime,omitempty"`
// Additional attributes
ObjectID string `json:"object_id,omitempty"`
SecurityDescriptor map[string]interface{} `json:"security_descriptor,omitempty"`
FileNames []FileNameAttribute `json:"file_names,omitempty"`
DataStreams []DataStream `json:"data_streams,omitempty"`
}
// CarvedRecord holds a candidate 1024-byte record and its global offset.
type CarvedRecord struct {
Offset int64
RecordBytes []byte
}
// ParsedRecord is what gets sent to the JSON writer.
type ParsedRecord struct {
Offset int64
RecordJSON map[string]interface{}
}
// --------------------
// Helper functions
// --------------------
// safeSlice returns data[start : start+length] and true when the range is in bounds; otherwise it returns nil and false.
func safeSlice(data []byte, start int, length int) ([]byte, bool) {
if start < 0 || start+length > len(data) {
return nil, false
}
return data[start : start+length], true
}
// filetimeToString converts a Windows FILETIME (100-nanosecond intervals since 1601-01-01 UTC) into an RFC3339 timestamp string.
func filetimeToString(ft uint64) string {
const epochDiff = 11644473600 // seconds between 1601 and 1970
secs := int64(ft/10000000) - epochDiff
nsec := int64(ft%10000000) * 100
t := time.Unix(secs, nsec).UTC()
return t.Format(time.RFC3339)
}
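// For reference, the two epochs round-trip as expected:
//
//	filetimeToString(0)                  // "1601-01-01T00:00:00Z" (FILETIME epoch)
//	filetimeToString(116444736000000000) // "1970-01-01T00:00:00Z" (Unix epoch)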
// decodeUTF16String converts little-endian UTF-16 bytes to a Go string.
func decodeUTF16String(b []byte) string {
if len(b)%2 != 0 {
b = b[:len(b)-1]
}
u16 := make([]uint16, len(b)/2)
for i := 0; i < len(u16); i++ {
u16[i] = binary.LittleEndian.Uint16(b[i*2:])
}
return string(utf16.Decode(u16))
}
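// Example: decodeUTF16String([]byte{0x4D, 0x00, 0x46, 0x00, 0x54, 0x00}) == "MFT".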
// parseZoneIdentifier is a simple key=value parser for Zone.Identifier alternate data streams.
func parseZoneIdentifier(content []byte) map[string]string {
result := make(map[string]string)
text := string(content)
lines := strings.Split(text, "\n")
for _, line := range lines {
if parts := strings.SplitN(line, "=", 2); len(parts) == 2 {
key := strings.TrimSpace(parts[0])
value := strings.TrimSpace(parts[1])
result[key] = value
}
}
return result
}
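// For a typical stream such as
//
//	[ZoneTransfer]
//	ZoneId=3
//	HostUrl=https://example.com/download.exe
//
// this returns {"ZoneId": "3", "HostUrl": "https://example.com/download.exe"};
// the "[ZoneTransfer]" header line contains no "=" and is skipped.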
// --------------------
// Attribute Parsing
// --------------------
// parseAttributes iterates over the attribute area and processes known types.
// It uses safeSlice to ensure we don't read beyond the record.
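// For reference, the common attribute header layout that the offsets below
// follow (all offsets relative to the attribute start):
//
//	0x00  uint32  attribute type
//	0x04  uint32  attribute length
//	0x08  byte    non-resident flag (0 = resident)
//	0x09  byte    name length (in UTF-16 code units)
//	0x0A  uint16  name offset
//	0x10  uint32  value length (resident attributes only)
//	0x14  uint16  value offset (resident attributes only)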
func parseAttributes(data []byte, rec *MFTRecord) {
offset := int(rec.FirstAttrOffset)
for offset < len(data)-8 {
// First 4 bytes: attribute type
attrBytes, ok := safeSlice(data, offset, 4)
if !ok {
break
}
attrType := binary.LittleEndian.Uint32(attrBytes)
// End marker
if attrType == 0xFFFFFFFF {
break
}
// Next 4 bytes: attribute length
attrLenBytes, ok := safeSlice(data, offset+4, 4)
if !ok {
break
}
attrLen := binary.LittleEndian.Uint32(attrLenBytes)
if attrLen < 8 || offset+int(attrLen) > len(data) {
// Malformed attribute, skip one byte and try to re-sync.
offset++
continue
}
// Non-resident flag at offset+8 (zero means the attribute value is resident).
residentFlag := data[offset+8]
// For resident attributes, extract value length and value offset.
var valLen uint32
var valOffset uint16
if residentFlag == 0 {
if vb, ok := safeSlice(data, offset+16, 4); ok {
valLen = binary.LittleEndian.Uint32(vb)
} else {
offset += int(attrLen)
continue
}
if vb, ok := safeSlice(data, offset+20, 2); ok {
valOffset = binary.LittleEndian.Uint16(vb)
} else {
offset += int(attrLen)
continue
}
}
// Process known attribute types.
switch attrType {
case 0x10: // $STANDARD_INFORMATION
if residentFlag == 0 {
if siData, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok && len(siData) >= 32 {
rec.SI_Crtime = filetimeToString(binary.LittleEndian.Uint64(siData[0:8]))
rec.SI_Mtime = filetimeToString(binary.LittleEndian.Uint64(siData[8:16]))
rec.SI_Ctime = filetimeToString(binary.LittleEndian.Uint64(siData[16:24]))
rec.SI_Atime = filetimeToString(binary.LittleEndian.Uint64(siData[24:32]))
}
}
case 0x30: // $FILE_NAME
if residentFlag == 0 {
if fnData, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok && len(fnData) >= 66 {
var fn FileNameAttribute
fn.ParentRef = binary.LittleEndian.Uint64(fnData[0:8]) & 0x0000FFFFFFFFFFFF
fn.Crtime = filetimeToString(binary.LittleEndian.Uint64(fnData[8:16]))
fn.Mtime = filetimeToString(binary.LittleEndian.Uint64(fnData[16:24]))
fn.Ctime = filetimeToString(binary.LittleEndian.Uint64(fnData[24:32]))
fn.Atime = filetimeToString(binary.LittleEndian.Uint64(fnData[32:40]))
if filenameLenBytes, ok := safeSlice(fnData, 64, 1); ok {
filenameLen := filenameLenBytes[0]
if nameBytes, ok := safeSlice(fnData, 66, int(filenameLen)*2); ok {
fn.Filename = decodeUTF16String(nameBytes)
}
}
rec.FileNames = append(rec.FileNames, fn)
}
}
case 0x80: // $DATA
var ds DataStream
// Extract the attribute's name if any.
if nameInfo, ok := safeSlice(data, offset+9, 1); ok {
nameLen := nameInfo[0]
// The byte at offset+9 is the name length; the uint16 at offset+10 is the
// name offset. The name itself starts at offset+nameOffset, not offset+10.
if nameOffBytes, ok := safeSlice(data, offset+10, 2); ok && nameLen > 0 {
nameOffset := binary.LittleEndian.Uint16(nameOffBytes)
if nameBytes, ok := safeSlice(data, offset+int(nameOffset), int(nameLen)*2); ok {
ds.Name = decodeUTF16String(nameBytes)
}
}
}
if residentFlag == 0 {
ds.Resident = true
ds.NonResident = false
if content, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok {
ds.ContentBase64 = base64.StdEncoding.EncodeToString(content)
if ds.Name == "Zone.Identifier" {
zoneInfo := parseZoneIdentifier(content)
ds.Name = fmt.Sprintf("Zone.Identifier %v", zoneInfo)
}
}
} else {
ds.Resident = false
ds.NonResident = true
}
rec.DataStreams = append(rec.DataStreams, ds)
case 0x40: // $OBJECT_ID
if residentFlag == 0 {
if objData, ok := safeSlice(data, offset+int(valOffset), 16); ok {
rec.ObjectID = fmt.Sprintf("%x", objData)
}
}
case 0x50: // $SECURITY_DESCRIPTOR
if residentFlag == 0 {
// A minimal parser: require at least the 20-byte header before storing anything.
if secData, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok && len(secData) >= 20 {
// We could decode further; here we just store raw hex values.
rec.SecurityDescriptor = map[string]interface{}{
"raw": fmt.Sprintf("%x", secData),
}
}
}
// (Other attribute types such as $ATTRIBUTE_LIST, $VOLUME_NAME, etc.
// can be added here following similar patterns.)
}
offset += int(attrLen)
}
}
// parseMFTRecord attempts to parse a 1024-byte MFT record.
// It returns an error if the record is too short or if the expected "FILE" marker is missing.
func parseMFTRecord(data []byte) (*MFTRecord, error) {
if len(data) < 48 {
return nil, fmt.Errorf("data too short to be a valid record")
}
if string(data[:4]) != "FILE" {
return nil, fmt.Errorf("invalid record header")
}
rec := &MFTRecord{
UpdateSeqOffset: binary.LittleEndian.Uint16(data[4:6]),
UpdateSeqSize: binary.LittleEndian.Uint16(data[6:8]),
LSN: binary.LittleEndian.Uint64(data[8:16]),
SequenceNumber: binary.LittleEndian.Uint16(data[16:18]),
HardLinkCount: binary.LittleEndian.Uint16(data[18:20]),
FirstAttrOffset: binary.LittleEndian.Uint16(data[20:22]),
Flags: binary.LittleEndian.Uint16(data[22:24]),
RealSize: binary.LittleEndian.Uint32(data[24:28]),
AllocatedSize: binary.LittleEndian.Uint32(data[28:32]),
BaseFileRecord: binary.LittleEndian.Uint64(data[32:40]),
NextAttrId: binary.LittleEndian.Uint16(data[40:42]),
// NTFS 3.1+ stores the record number at bytes 44-47 (42-43 are padding).
RecordNumber: binary.LittleEndian.Uint32(data[44:48]),
}
parseAttributes(data, rec)
return rec, nil
}
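// Typical call site (sketch; buf is any carved 1024-byte candidate):
//
//	if rec, err := parseMFTRecord(buf); err == nil {
//		fmt.Println(rec.RecordNumber, rec.SI_Mtime)
//	}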
// --------------------
// Parallel Processing and Main
// --------------------
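// processImageFile scans one image: the scan loop below carves 1024-byte
// "FILE0" candidates onto carvedChan, a pool of NumCPU workers writes each
// raw record to disk, parses it, and forwards the JSON map on parsedChan,
// and a single writer goroutine serializes the JSONL output.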
func processImageFile(inputFile string, wg *sync.WaitGroup) {
defer wg.Done()
f, err := os.Open(inputFile)
if err != nil {
log.Printf("Failed to open %s: %v", inputFile, err)
return
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
log.Printf("Failed to stat %s: %v", inputFile, err)
return
}
fileSize := fi.Size()
// Create an output folder and JSONL file based on input file name and current timestamp.
timestamp := time.Now().Format("20060102150405")
baseName := filepath.Base(inputFile)
outDir := fmt.Sprintf("%s_%s", baseName, timestamp)
if err := os.Mkdir(outDir, 0755); err != nil {
log.Printf("Failed to create output directory for %s: %v", inputFile, err)
return
}
jsonlFileName := fmt.Sprintf("%s_%s.jsonl", baseName, timestamp)
jsonlFile, err := os.Create(jsonlFileName)
if err != nil {
log.Printf("Failed to create JSONL file for %s: %v", inputFile, err)
return
}
defer jsonlFile.Close()
carvedChan := make(chan CarvedRecord, 100)
parsedChan := make(chan ParsedRecord, 100)
// Worker pool that writes carved records to disk and parses them.
numWorkers := runtime.NumCPU()
var workerWg sync.WaitGroup
for i := 0; i < numWorkers; i++ {
workerWg.Add(1)
go func() {
defer workerWg.Done()
for carved := range carvedChan {
// Write raw record to disk.
recordFileName := filepath.Join(outDir, fmt.Sprintf("%d.mftrecord", carved.Offset))
if err := os.WriteFile(recordFileName, carved.RecordBytes, 0644); err != nil {
log.Printf("Failed to write record at offset %d in %s: %v", carved.Offset, inputFile, err)
continue
}
// Parse the record.
mft, err := parseMFTRecord(carved.RecordBytes)
if err != nil {
// Skip records that cannot be parsed.
continue
}
// Build JSON record (omitting the magic field).
recordMap := map[string]interface{}{
"input_image": inputFile,
"offset": carved.Offset,
"update_seq_offset": mft.UpdateSeqOffset,
"update_seq_size": mft.UpdateSeqSize,
"lsn": mft.LSN,
"sequence_number": mft.SequenceNumber,
"hard_link_count": mft.HardLinkCount,
"first_attr_offset": mft.FirstAttrOffset,
"flags": mft.Flags,
"real_size": mft.RealSize,
"allocated_size": mft.AllocatedSize,
"base_file_record": mft.BaseFileRecord,
"next_attr_id": mft.NextAttrId,
"record_number": mft.RecordNumber,
"si_crtime": mft.SI_Crtime,
"si_mtime": mft.SI_Mtime,
"si_ctime": mft.SI_CtTime,
"si_atime": mft.SI_Atime,
"object_id": mft.ObjectID,
"security_descriptor": mft.SecurityDescriptor,
"file_names": mft.FileNames,
"data_streams": mft.DataStreams,
}
parsedChan <- ParsedRecord{Offset: carved.Offset, RecordJSON: recordMap}
}
}()
}
// Writer goroutine to output JSONL records.
var writerWg sync.WaitGroup
writerWg.Add(1)
go func() {
defer writerWg.Done()
encoder := json.NewEncoder(jsonlFile)
for pr := range parsedChan {
if err := encoder.Encode(pr.RecordJSON); err != nil {
log.Printf("Error writing JSON record at offset %d in %s: %v", pr.Offset, inputFile, err)
}
}
}()
// Scan the file for the "FILE0" pattern.
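// Chunks are stitched together with a len(pattern)-1 byte leftover tail so a
// signature straddling a chunk boundary is still found exactly once.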
reader := bufio.NewReader(f)
const recordSize = 1024
const chunkSize = 1024 * 1024
pattern := []byte("FILE0")
var fileOffset int64 = 0
var leftover []byte
lastPrint := time.Now()
for {
chunk := make([]byte, chunkSize)
n, err := reader.Read(chunk)
if n == 0 {
break
}
data := append(leftover, chunk[:n]...)
// Progress update every ~5 seconds.
if time.Since(lastPrint) > 5*time.Second {
perc := float64(fileOffset) / float64(fileSize) * 100.0
log.Printf("Processing %s: %.2f%% complete", inputFile, perc)
lastPrint = time.Now()
}
searchLimit := len(data) - len(pattern)
for i := 0; i <= searchLimit; i++ {
if bytes.Equal(data[i:i+len(pattern)], pattern) {
globalOffset := fileOffset - int64(len(leftover)) + int64(i)
if globalOffset+recordSize > fileSize {
continue
}
recordBytes := make([]byte, recordSize)
_, err := f.ReadAt(recordBytes, globalOffset)
if err != nil {
log.Printf("Failed to read record at offset %d in %s: %v", globalOffset, inputFile, err)
continue
}
carvedChan <- CarvedRecord{Offset: globalOffset, RecordBytes: recordBytes}
}
}
if len(data) >= len(pattern)-1 {
leftover = data[len(data)-(len(pattern)-1):]
} else {
leftover = data
}
fileOffset += int64(n)
if err == io.EOF {
break
}
}
close(carvedChan)
workerWg.Wait()
close(parsedChan)
writerWg.Wait()
log.Printf("Finished processing %s. Raw records are in %s and JSONL file is %s", inputFile, outDir, jsonlFileName)
}
func main() {
flag.Parse()
if flag.NArg() == 0 {
fmt.Printf("Usage: %s <disk image files or directories>\n", os.Args[0])
os.Exit(1)
}
// Build list of files from provided arguments (recursively if directories).
var files []string
for _, arg := range flag.Args() {
fi, err := os.Stat(arg)
if err != nil {
log.Printf("Error stating %s: %v", arg, err)
continue
}
if fi.IsDir() {
err := filepath.Walk(arg, func(path string, info os.FileInfo, err error) error {
if err != nil {
return nil
}
if !info.IsDir() {
files = append(files, path)
}
return nil
})
if err != nil {
log.Printf("Error walking directory %s: %v", arg, err)
}
} else {
files = append(files, arg)
}
}
totalFiles := len(files)
if totalFiles == 0 {
log.Println("No input files found.")
return
}
log.Printf("Found %d files to process.", totalFiles)
var wg sync.WaitGroup
concurrentFiles := runtime.NumCPU()
sem := make(chan struct{}, concurrentFiles)
for i, file := range files {
wg.Add(1)
sem <- struct{}{}
go func(i int, file string) {
defer func() { <-sem }()
log.Printf("Starting file %d/%d: %s", i+1, totalFiles, file)
processImageFile(file, &wg)
}(i, file)
}
wg.Wait()
log.Println("All files processed.")
}
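// Example invocation (illustrative binary name and paths):
//
//	go build -o mftcarve mft.go
//	./mftcarve image.dd /cases/images/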