// mft/mft.go

package main

import (
	"bufio"
	"bytes"
	"encoding/base64"
	"encoding/binary"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"
	"unicode/utf16"
)

// --------------------
// Data Structures
// --------------------

// FileNameAttribute holds the fields parseAttributes extracts from one
// resident $FILE_NAME (type 0x30) attribute. All four timestamps are strings
// produced by filetimeToString, i.e. RFC 3339 in UTC.
type FileNameAttribute struct {
	// Filename is the name decoded from the UTF-16LE bytes stored after the
	// fixed part of the attribute value; empty if the name did not fit.
	Filename  string `json:"filename"`
	// ParentRef is the low 48 bits of the 8-byte reference at the start of
	// the value (the upper 16 bits are masked off).
	ParentRef uint64 `json:"parent_ref"`
	// Crtime is decoded from value bytes 8..15.
	Crtime    string `json:"crtime"`
	// Mtime is decoded from value bytes 16..23.
	Mtime     string `json:"mtime"`
	// CtTime is decoded from value bytes 24..31; note the JSON key is "ctime".
	CtTime    string `json:"ctime"`
	// Atime is decoded from value bytes 32..39.
	Atime     string `json:"atime"`
}

// DataStream describes one $DATA (type 0x80) attribute found by
// parseAttributes. One entry is appended per $DATA attribute, resident or not.
type DataStream struct {
	// Name is the attribute name as decoded by parseAttributes. For a
	// resident stream named "Zone.Identifier" it is replaced by
	// "Zone.Identifier " followed by the parsed key/value map.
	Name          string `json:"name"`
	// Resident is true when the attribute's resident-flag byte is 0.
	Resident      bool   `json:"resident"`
	// NonResident is always the inverse of Resident.
	NonResident   bool   `json:"non_resident"`
	// ContentBase64 is the standard base64 encoding of the stream content.
	// It is only set for resident streams whose content could be read.
	ContentBase64 string `json:"content_base64,omitempty"`
}

// MFTRecord is the decoded form of one carved MFT record: the fixed header
// fields read by parseMFTRecord plus whatever attributes parseAttributes
// recognised.
//
// Note that processImageFile builds its JSON output from a hand-written map
// rather than by marshaling this struct, so the json tags (including
// omitempty) only take effect if the struct itself is marshaled.
type MFTRecord struct {
	// Header fields, read little-endian from the start of the record.
	// The 4-byte "FILE" magic is checked by parseMFTRecord but not stored.
	UpdateSeqOffset uint16 `json:"update_seq_offset"`
	UpdateSeqSize   uint16 `json:"update_seq_size"`
	LSN             uint64 `json:"lsn"`
	SequenceNumber  uint16 `json:"sequence_number"`
	HardLinkCount   uint16 `json:"hard_link_count"`
	// FirstAttrOffset is where parseAttributes starts walking attributes.
	FirstAttrOffset uint16 `json:"first_attr_offset"`
	Flags           uint16 `json:"flags"`
	RealSize        uint32 `json:"real_size"`
	AllocatedSize   uint32 `json:"allocated_size"`
	BaseFileRecord  uint64 `json:"base_file_record"`
	NextAttrId      uint16 `json:"next_attr_id"`
	RecordNumber    uint32 `json:"record_number"`

	// Timestamps from a resident $STANDARD_INFORMATION (type 0x10) attribute,
	// formatted by filetimeToString. They stay empty when the attribute is
	// absent, non-resident, or shorter than 32 bytes.
	SI_Crtime string `json:"si_crtime,omitempty"`
	SI_Mtime  string `json:"si_mtime,omitempty"`
	SI_CtTime string `json:"si_ctime,omitempty"`
	SI_Atime  string `json:"si_atime,omitempty"`

	// ObjectID is the lowercase hex of the first 16 value bytes of a resident
	// type 0x40 attribute. SecurityDescriptor, when set, holds a single "raw"
	// key with the hex dump of a resident type 0x50 attribute value.
	ObjectID           string                 `json:"object_id,omitempty"`
	SecurityDescriptor map[string]interface{} `json:"security_descriptor,omitempty"`

	// FileNames and DataStreams collect every $FILE_NAME and $DATA attribute
	// in the order they appear in the record.
	FileNames   []FileNameAttribute `json:"file_names,omitempty"`
	DataStreams []DataStream        `json:"data_streams,omitempty"`
}

// CarvedRecord holds a candidate 1024-byte record and its global offset.
// It is the unit of work sent from the scanner in processImageFile to the
// parsing workers.
type CarvedRecord struct {
	// Offset is the byte position in the input image at which the "FILE0"
	// signature was found.
	Offset      int64
	// RecordBytes is the block read from the image starting at Offset.
	RecordBytes []byte
}

// ParsedRecord is what gets sent to the JSON writer: one successfully parsed
// record, already flattened into the map that becomes a single JSONL line.
type ParsedRecord struct {
	// Offset is the record's byte position in the input image; the writer
	// only uses it in error messages.
	Offset     int64
	// RecordJSON is the key/value map encoded as one line of output.
	RecordJSON map[string]interface{}
}

// --------------------
// Helper functions
// --------------------

// safeSlice returns data[start : start+length] and true when that range lies
// entirely inside data; otherwise it returns nil and false.
//
// A negative start or length is rejected. The bounds test is written as
// length > len(data)-start rather than start+length > len(data) so that a
// very large length cannot overflow int and slip past the check, which would
// otherwise panic in the slice expression below.
func safeSlice(data []byte, start int, length int) ([]byte, bool) {
	if start < 0 || length < 0 || start > len(data) || length > len(data)-start {
		return nil, false
	}
	return data[start : start+length], true
}

// filetimeToString renders a Windows FILETIME value (a count of 100-nanosecond
// ticks since 1601-01-01 UTC) as an RFC 3339 timestamp in UTC. The RFC 3339
// layout has no fractional part, so sub-second precision is not printed.
func filetimeToString(ft uint64) string {
	const (
		ticksPerSecond  = 10000000    // 100 ns ticks in one second
		secsBeforeEpoch = 11644473600 // seconds from 1601-01-01 to 1970-01-01
	)
	wholeSecs := ft / ticksPerSecond
	extraTicks := ft % ticksPerSecond
	moment := time.Unix(int64(wholeSecs)-secsBeforeEpoch, int64(extraTicks)*100)
	return moment.UTC().Format(time.RFC3339)
}

// decodeUTF16String interprets b as little-endian UTF-16 code units and
// returns the decoded Go string. If b has an odd length the final dangling
// byte is ignored.
func decodeUTF16String(b []byte) string {
	unitCount := len(b) / 2 // integer division drops a trailing odd byte
	units := make([]uint16, 0, unitCount)
	for pos := 0; pos < unitCount*2; pos += 2 {
		units = append(units, binary.LittleEndian.Uint16(b[pos:pos+2]))
	}
	return string(utf16.Decode(units))
}

// parseZoneIdentifier extracts key=value pairs from the text of a
// Zone.Identifier stream. The content is split on "\n"; every line containing
// an '=' contributes one entry, split at the first '=' with surrounding
// whitespace (including a trailing "\r") trimmed from both key and value.
// Lines without '=' (for example a "[ZoneTransfer]" header) are ignored, and
// a later duplicate key overwrites an earlier one.
func parseZoneIdentifier(content []byte) map[string]string {
	pairs := map[string]string{}
	for _, row := range strings.Split(string(content), "\n") {
		eq := strings.Index(row, "=")
		if eq < 0 {
			continue
		}
		name := strings.TrimSpace(row[:eq])
		pairs[name] = strings.TrimSpace(row[eq+1:])
	}
	return pairs
}

// --------------------
// Attribute Parsing
// --------------------

// parseAttributes walks the attribute list of a raw MFT record, starting at
// rec.FirstAttrOffset, and copies the attributes it understands into rec:
//
//   - 0x10 $STANDARD_INFORMATION: the four SI_* timestamps
//   - 0x30 $FILE_NAME: one FileNameAttribute appended per attribute
//   - 0x40 $OBJECT_ID: first 16 value bytes as hex
//   - 0x50 $SECURITY_DESCRIPTOR: raw value as hex
//   - 0x80 $DATA: one DataStream appended per attribute
//
// The walk stops at the 0xFFFFFFFF end marker or when fewer than 9 bytes
// remain. An attribute whose declared length is below 8 or runs past the end
// of data is treated as corruption: the cursor advances by a single byte and
// parsing tries to re-synchronise. Every read of an attribute's header, name
// and value is bounded by that attribute's own declared length, so a bad
// offset or size inside one attribute cannot pull in bytes from its
// neighbours. Unknown attribute types are skipped.
func parseAttributes(data []byte, rec *MFTRecord) {
	const (
		typeStandardInformation = 0x10
		typeFileName            = 0x30
		typeObjectID            = 0x40
		typeSecurityDescriptor  = 0x50
		typeData                = 0x80
		typeEnd                 = 0xFFFFFFFF

		commonHeaderLen   = 16 // type, length, resident flag, name info, flags, id
		residentHeaderLen = 22 // common header + value length + value offset
	)

	offset := int(rec.FirstAttrOffset)
	for offset < len(data)-8 {
		attrType := binary.LittleEndian.Uint32(data[offset : offset+4])
		if attrType == typeEnd {
			break
		}
		attrLen := binary.LittleEndian.Uint32(data[offset+4 : offset+8])
		// Compare in uint64 so a huge declared length cannot overflow int.
		if attrLen < 8 || uint64(attrLen) > uint64(len(data)-offset) {
			// Malformed attribute, skip one byte and try to re-sync.
			offset++
			continue
		}
		attr := data[offset : offset+int(attrLen)]
		// Advance now so every `continue` below moves to the next attribute.
		offset += int(attrLen)

		if len(attr) < commonHeaderLen {
			continue // too short to even hold the common header
		}
		resident := attr[8] == 0

		var value []byte
		var haveValue bool
		if resident {
			if len(attr) < residentHeaderLen {
				continue // resident header truncated
			}
			value, haveValue = residentAttrValue(attr)
		}

		switch attrType {
		case typeStandardInformation:
			if haveValue && len(value) >= 32 {
				rec.SI_Crtime = filetimeToString(binary.LittleEndian.Uint64(value[0:8]))
				rec.SI_Mtime = filetimeToString(binary.LittleEndian.Uint64(value[8:16]))
				rec.SI_CtTime = filetimeToString(binary.LittleEndian.Uint64(value[16:24]))
				rec.SI_Atime = filetimeToString(binary.LittleEndian.Uint64(value[24:32]))
			}
		case typeFileName:
			if haveValue {
				if fn, ok := parseFileNameValue(value); ok {
					rec.FileNames = append(rec.FileNames, fn)
				}
			}
		case typeData:
			ds := DataStream{
				Name:        attrStreamName(attr),
				Resident:    resident,
				NonResident: !resident,
			}
			if haveValue {
				ds.ContentBase64 = base64.StdEncoding.EncodeToString(value)
				if ds.Name == "Zone.Identifier" {
					ds.Name = fmt.Sprintf("Zone.Identifier %v", parseZoneIdentifier(value))
				}
			}
			rec.DataStreams = append(rec.DataStreams, ds)
		case typeObjectID:
			if haveValue && len(value) >= 16 {
				rec.ObjectID = fmt.Sprintf("%x", value[:16])
			}
		case typeSecurityDescriptor:
			// Minimal handling: keep the raw descriptor as hex once it is at
			// least as long as a 20-byte descriptor header.
			if haveValue && len(value) >= 20 {
				rec.SecurityDescriptor = map[string]interface{}{
					"raw": fmt.Sprintf("%x", value),
				}
			}
			// Other attribute types ($ATTRIBUTE_LIST, $VOLUME_NAME, ...) can be
			// added here following the same pattern.
		}
	}
}

// residentAttrValue returns the value bytes of a resident attribute. The value
// length is the uint32 at attr[16:20] and its offset (relative to the start of
// the attribute) the uint16 at attr[20:22]. It reports false if the header is
// truncated or the value does not fit inside attr.
func residentAttrValue(attr []byte) ([]byte, bool) {
	if len(attr) < 22 {
		return nil, false
	}
	size := uint64(binary.LittleEndian.Uint32(attr[16:20]))
	start := int(binary.LittleEndian.Uint16(attr[20:22]))
	if start > len(attr) || size > uint64(len(attr)-start) {
		return nil, false
	}
	return attr[start : start+int(size)], true
}

// attrStreamName returns the attribute's own name (the stream name for $DATA).
// attr[9] is the name length in UTF-16 code units and attr[10:12] the offset
// of the name from the start of the attribute. It returns "" for unnamed
// attributes or when the name does not fit inside attr.
func attrStreamName(attr []byte) string {
	if len(attr) < 12 {
		return ""
	}
	units := int(attr[9])
	if units == 0 {
		return ""
	}
	start := int(binary.LittleEndian.Uint16(attr[10:12]))
	end := start + units*2
	if end > len(attr) {
		return ""
	}
	return decodeUTF16String(attr[start:end])
}

// parseFileNameValue decodes the value of a $FILE_NAME attribute. It reports
// false when value is shorter than the 66-byte fixed part. If the name itself
// is truncated the entry is still returned, with an empty Filename.
func parseFileNameValue(value []byte) (FileNameAttribute, bool) {
	if len(value) < 66 {
		return FileNameAttribute{}, false
	}
	fn := FileNameAttribute{
		// Only the low 48 bits are kept; the top 16 bits are masked off.
		ParentRef: binary.LittleEndian.Uint64(value[0:8]) & 0x0000FFFFFFFFFFFF,
		Crtime:    filetimeToString(binary.LittleEndian.Uint64(value[8:16])),
		Mtime:     filetimeToString(binary.LittleEndian.Uint64(value[16:24])),
		CtTime:    filetimeToString(binary.LittleEndian.Uint64(value[24:32])),
		Atime:     filetimeToString(binary.LittleEndian.Uint64(value[32:40])),
	}
	// value[64] is the name length in UTF-16 code units; the name follows at 66.
	if end := 66 + int(value[64])*2; end <= len(value) {
		fn.Filename = decodeUTF16String(value[66:end])
	}
	return fn, true
}

// parseMFTRecord decodes the fixed header of a raw MFT record and then hands
// the buffer to parseAttributes to fill in the attribute-derived fields.
//
// It returns an error when data is shorter than the 48-byte header or does not
// start with the "FILE" signature. Attribute-level corruption is not an error:
// whatever could be decoded is returned.
//
// The record number is read from bytes 44..47 (0x2C); bytes 42..43 are
// alignment padding, so reading the field at 42 yields a value shifted by 16
// bits.
//
// NOTE(review): the 0x2C record-number field belongs to the NTFS 3.1 header
// layout; older volumes may not carry it — confirm if pre-XP images matter.
// NOTE(review): update-sequence fixups are not applied here, so the last two
// bytes of each sector are used as stored on disk — confirm whether that is
// acceptable for attributes that straddle a sector boundary.
func parseMFTRecord(data []byte) (*MFTRecord, error) {
	const headerLen = 48
	if len(data) < headerLen {
		return nil, fmt.Errorf("data too short to be a valid record")
	}
	if string(data[:4]) != "FILE" {
		return nil, fmt.Errorf("invalid record header")
	}
	rec := &MFTRecord{
		UpdateSeqOffset: binary.LittleEndian.Uint16(data[4:6]),
		UpdateSeqSize:   binary.LittleEndian.Uint16(data[6:8]),
		LSN:             binary.LittleEndian.Uint64(data[8:16]),
		SequenceNumber:  binary.LittleEndian.Uint16(data[16:18]),
		HardLinkCount:   binary.LittleEndian.Uint16(data[18:20]),
		FirstAttrOffset: binary.LittleEndian.Uint16(data[20:22]),
		Flags:           binary.LittleEndian.Uint16(data[22:24]),
		RealSize:        binary.LittleEndian.Uint32(data[24:28]),
		AllocatedSize:   binary.LittleEndian.Uint32(data[28:32]),
		BaseFileRecord:  binary.LittleEndian.Uint64(data[32:40]),
		NextAttrId:      binary.LittleEndian.Uint16(data[40:42]),
		// data[42:44] is padding; the record number starts at 0x2C.
		RecordNumber: binary.LittleEndian.Uint32(data[44:48]),
	}
	parseAttributes(data, rec)
	return rec, nil
}

// --------------------
// Parallel Processing and Main
// --------------------

// processImageFile carves candidate MFT records out of one input image.
//
// It scans inputFile for the byte signature "FILE0", reads 1024 bytes at each
// hit, and feeds them to a pool of runtime.NumCPU() workers. Each worker
// writes the raw bytes to <outDir>/<offset>.mftrecord, parses them with
// parseMFTRecord and, on success, passes a flattened map to a single writer
// goroutine that appends it as one line to the JSONL file. Both the output
// directory and the JSONL file are created in the current working directory
// and named "<base name of inputFile>_<YYYYMMDDhhmmss>".
//
// wg.Done is called exactly once when the function returns. Failures are
// logged rather than returned; a failure to open, stat or create the outputs
// aborts processing of this file.
//
// NOTE(review): the output names have one-second resolution and use only the
// base name, so two inputs with the same base name started within the same
// second collide and the second one is skipped when os.Mkdir fails.
func processImageFile(inputFile string, wg *sync.WaitGroup) {
	defer wg.Done()

	f, err := os.Open(inputFile)
	if err != nil {
		log.Printf("Failed to open %s: %v", inputFile, err)
		return
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		log.Printf("Failed to stat %s: %v", inputFile, err)
		return
	}
	fileSize := fi.Size()

	// Create an output folder and JSONL file based on input file name and current timestamp.
	timestamp := time.Now().Format("20060102150405")
	baseName := filepath.Base(inputFile)
	outDir := fmt.Sprintf("%s_%s", baseName, timestamp)
	if err := os.Mkdir(outDir, 0755); err != nil {
		log.Printf("Failed to create output directory for %s: %v", inputFile, err)
		return
	}
	jsonlFileName := fmt.Sprintf("%s_%s.jsonl", baseName, timestamp)
	jsonlFile, err := os.Create(jsonlFileName)
	if err != nil {
		log.Printf("Failed to create JSONL file for %s: %v", inputFile, err)
		return
	}
	defer func() {
		// A failed close can mean lost output, so surface it.
		if err := jsonlFile.Close(); err != nil {
			log.Printf("Failed to close JSONL file %s: %v", jsonlFileName, err)
		}
	}()

	carvedChan := make(chan CarvedRecord, 100)
	parsedChan := make(chan ParsedRecord, 100)

	// Worker pool for carving/parsing records.
	numWorkers := runtime.NumCPU()
	var workerWg sync.WaitGroup
	for i := 0; i < numWorkers; i++ {
		workerWg.Add(1)
		go func() {
			defer workerWg.Done()
			for carved := range carvedChan {
				// Write raw record to disk.
				recordFileName := filepath.Join(outDir, fmt.Sprintf("%d.mftrecord", carved.Offset))
				if err := os.WriteFile(recordFileName, carved.RecordBytes, 0644); err != nil {
					log.Printf("Failed to write record at offset %d in %s: %v", carved.Offset, inputFile, err)
					continue
				}
				// Parse the record.
				mft, err := parseMFTRecord(carved.RecordBytes)
				if err != nil {
					// Skip records that cannot be parsed.
					continue
				}
				// Build JSON record (omitting the magic field).
				recordMap := map[string]interface{}{
					"input_image":         inputFile,
					"offset":              carved.Offset,
					"update_seq_offset":   mft.UpdateSeqOffset,
					"update_seq_size":     mft.UpdateSeqSize,
					"lsn":                 mft.LSN,
					"sequence_number":     mft.SequenceNumber,
					"hard_link_count":     mft.HardLinkCount,
					"first_attr_offset":   mft.FirstAttrOffset,
					"flags":               mft.Flags,
					"real_size":           mft.RealSize,
					"allocated_size":      mft.AllocatedSize,
					"base_file_record":    mft.BaseFileRecord,
					"next_attr_id":        mft.NextAttrId,
					"record_number":       mft.RecordNumber,
					"si_crtime":           mft.SI_Crtime,
					"si_mtime":            mft.SI_Mtime,
					"si_ctime":            mft.SI_CtTime,
					"si_atime":            mft.SI_Atime,
					"object_id":           mft.ObjectID,
					"security_descriptor": mft.SecurityDescriptor,
					"file_names":          mft.FileNames,
					"data_streams":        mft.DataStreams,
				}
				parsedChan <- ParsedRecord{Offset: carved.Offset, RecordJSON: recordMap}
			}
		}()
	}

	// Writer goroutine to output JSONL records. Output is buffered so each
	// record does not cost a separate write syscall; the buffer is flushed
	// once parsedChan is closed and drained.
	var writerWg sync.WaitGroup
	writerWg.Add(1)
	go func() {
		defer writerWg.Done()
		out := bufio.NewWriter(jsonlFile)
		encoder := json.NewEncoder(out)
		for pr := range parsedChan {
			if err := encoder.Encode(pr.RecordJSON); err != nil {
				log.Printf("Error writing JSON record at offset %d in %s: %v", pr.Offset, inputFile, err)
			}
		}
		if err := out.Flush(); err != nil {
			log.Printf("Error flushing JSONL file %s: %v", jsonlFileName, err)
		}
	}()

	// Scan the file for the "FILE0" pattern.
	reader := bufio.NewReader(f)
	const recordSize = 1024
	const chunkSize = 1024 * 1024
	pattern := []byte("FILE0")
	// A signature can straddle two reads, so the last len(pattern)-1 bytes of
	// each window are carried to the front of the next one. That tail is
	// shorter than the pattern, so no hit is ever reported twice.
	overlap := len(pattern) - 1

	// One buffer is reused for the whole scan: [carried tail | fresh bytes].
	buf := make([]byte, overlap+chunkSize)
	carried := 0
	var fileOffset int64 // bytes consumed from the reader so far
	lastPrint := time.Now()

	for {
		n, readErr := reader.Read(buf[carried:])
		if n == 0 {
			if readErr != nil && readErr != io.EOF {
				log.Printf("Failed to read %s at offset %d: %v", inputFile, fileOffset, readErr)
			}
			break
		}
		data := buf[:carried+n]

		// Progress update every ~5 seconds.
		if time.Since(lastPrint) > 5*time.Second {
			perc := float64(fileOffset) / float64(fileSize) * 100.0
			log.Printf("Processing %s: %.2f%% complete", inputFile, perc)
			lastPrint = time.Now()
		}

		// Image offset of data[0].
		windowStart := fileOffset - int64(carried)
		from := 0
		for {
			idx := bytes.Index(data[from:], pattern)
			if idx < 0 {
				break
			}
			hit := from + idx
			from = hit + 1 // resume just past the start of this hit

			globalOffset := windowStart + int64(hit)
			if globalOffset+recordSize > fileSize {
				continue
			}
			// Each record gets its own buffer because ownership passes to a worker.
			recordBytes := make([]byte, recordSize)
			if _, err := f.ReadAt(recordBytes, globalOffset); err != nil {
				log.Printf("Failed to read record at offset %d in %s: %v", globalOffset, inputFile, err)
				continue
			}
			carvedChan <- CarvedRecord{Offset: globalOffset, RecordBytes: recordBytes}
		}

		// Keep the tail for the next window; copy handles the overlap.
		tail := overlap
		if len(data) < tail {
			tail = len(data)
		}
		carried = copy(buf, data[len(data)-tail:])
		fileOffset += int64(n)

		if readErr != nil {
			if readErr != io.EOF {
				log.Printf("Failed to read %s at offset %d: %v", inputFile, fileOffset, readErr)
			}
			break
		}
	}

	// Shut the pipeline down in order: no more work, workers drain, then the
	// writer drains and flushes.
	close(carvedChan)
	workerWg.Wait()
	close(parsedChan)
	writerWg.Wait()
	log.Printf("Finished processing %s. Raw records are in %s and JSONL file is %s", inputFile, outDir, jsonlFileName)
}

// main expands the command-line arguments into a flat list of input files
// (directories are walked recursively) and runs processImageFile on each of
// them, with at most runtime.NumCPU() files in flight at once. With no
// arguments it prints a usage line and exits with status 1.
func main() {
	flag.Parse()
	args := flag.Args()
	if len(args) == 0 {
		fmt.Printf("Usage: %s <disk image files or directories>\n", os.Args[0])
		os.Exit(1)
	}

	// Build list of files from provided arguments (recursively if directories).
	var files []string
	// collect records every non-directory entry; entries that could not be
	// visited are silently skipped so one unreadable path does not abort the walk.
	collect := func(path string, info os.FileInfo, walkErr error) error {
		if walkErr == nil && !info.IsDir() {
			files = append(files, path)
		}
		return nil
	}
	for _, arg := range args {
		info, err := os.Stat(arg)
		switch {
		case err != nil:
			log.Printf("Error stating %s: %v", arg, err)
		case !info.IsDir():
			files = append(files, arg)
		default:
			if err := filepath.Walk(arg, collect); err != nil {
				log.Printf("Error walking directory %s: %v", arg, err)
			}
		}
	}

	totalFiles := len(files)
	if totalFiles == 0 {
		log.Println("No input files found.")
		return
	}
	log.Printf("Found %d files to process.", totalFiles)

	// slots is a counting semaphore: sending blocks once NumCPU files are
	// being processed, and each goroutine frees its slot when it finishes.
	var wg sync.WaitGroup
	slots := make(chan struct{}, runtime.NumCPU())
	for idx := range files {
		wg.Add(1)
		slots <- struct{}{}
		go func(ordinal int, path string) {
			defer func() { <-slots }()
			log.Printf("Starting file %d/%d: %s", ordinal, totalFiles, path)
			// processImageFile calls wg.Done itself.
			processImageFile(path, &wg)
		}(idx+1, files[idx])
	}
	wg.Wait()
	log.Println("All files processed.")
}