package main

import (
	"bufio"
	"bytes"
	"encoding/base64"
	"encoding/binary"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"
	"unicode/utf16"
)

// --------------------
// Data Structures
// --------------------

type FileNameAttribute struct {
	Filename  string `json:"filename"`
	ParentRef uint64 `json:"parent_ref"`
	Crtime    string `json:"crtime"`
	Mtime     string `json:"mtime"`
	CtTime    string `json:"ctime"`
	Atime     string `json:"atime"`
}

type DataStream struct {
	Name          string `json:"name"`
	Resident      bool   `json:"resident"`
	NonResident   bool   `json:"non_resident"`
	ContentBase64 string `json:"content_base64,omitempty"`
}

type MFTRecord struct {
	// Header fields (omitting the "magic" since it's always "FILE").
	UpdateSeqOffset uint16 `json:"update_seq_offset"`
	UpdateSeqSize   uint16 `json:"update_seq_size"`
	LSN             uint64 `json:"lsn"`
	SequenceNumber  uint16 `json:"sequence_number"`
	HardLinkCount   uint16 `json:"hard_link_count"`
	FirstAttrOffset uint16 `json:"first_attr_offset"`
	Flags           uint16 `json:"flags"`
	RealSize        uint32 `json:"real_size"`
	AllocatedSize   uint32 `json:"allocated_size"`
	BaseFileRecord  uint64 `json:"base_file_record"`
	NextAttrId      uint16 `json:"next_attr_id"`
	RecordNumber    uint32 `json:"record_number"`

	// $STANDARD_INFORMATION timestamps (if present).
	SI_Crtime string `json:"si_crtime,omitempty"`
	SI_Mtime  string `json:"si_mtime,omitempty"`
	SI_CtTime string `json:"si_ctime,omitempty"`
	SI_Atime  string `json:"si_atime,omitempty"`

	// Additional attributes.
	ObjectID           string                 `json:"object_id,omitempty"`
	SecurityDescriptor map[string]interface{} `json:"security_descriptor,omitempty"`
	FileNames          []FileNameAttribute    `json:"file_names,omitempty"`
	DataStreams        []DataStream           `json:"data_streams,omitempty"`
}

// CarvedRecord holds a candidate 1024-byte record and its global offset.
type CarvedRecord struct {
	Offset      int64
	RecordBytes []byte
}

// ParsedRecord is what gets sent to the JSON writer.
type ParsedRecord struct {
	Offset     int64
	RecordJSON map[string]interface{}
}

// --------------------
// Helper Functions
// --------------------

// safeSlice returns data[start : start+length] if it lies within bounds;
// otherwise it returns (nil, false).
func safeSlice(data []byte, start, length int) ([]byte, bool) {
	if start < 0 || start+length > len(data) {
		return nil, false
	}
	return data[start : start+length], true
}

// filetimeToString converts a Windows FILETIME (100-nanosecond intervals
// since 1601-01-01 UTC) into an RFC 3339 timestamp string.
func filetimeToString(ft uint64) string {
	const epochDiff = 11644473600 // seconds between 1601-01-01 and 1970-01-01
	secs := int64(ft/10000000) - epochDiff
	nsec := int64(ft%10000000) * 100
	return time.Unix(secs, nsec).UTC().Format(time.RFC3339)
}

// decodeUTF16String converts little-endian UTF-16 bytes to a Go string.
func decodeUTF16String(b []byte) string {
	if len(b)%2 != 0 {
		b = b[:len(b)-1]
	}
	u16 := make([]uint16, len(b)/2)
	for i := range u16 {
		u16[i] = binary.LittleEndian.Uint16(b[i*2:])
	}
	return string(utf16.Decode(u16))
}

// parseZoneIdentifier is a simple key=value parser for Zone.Identifier streams.
func parseZoneIdentifier(content []byte) map[string]string {
	result := make(map[string]string)
	for _, line := range strings.Split(string(content), "\n") {
		if parts := strings.SplitN(line, "=", 2); len(parts) == 2 {
			result[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
		}
	}
	return result
}
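// exampleZoneIdentifier shows what parseZoneIdentifier yields for a typical
// stream. The literal below is illustrative only: Zone.Identifier ADS
// payloads written by browsers follow this [ZoneTransfer] layout, with
// ZoneId=3 marking the Internet zone. The section header line carries no
// "=" and is skipped by the parser.
func exampleZoneIdentifier() map[string]string {
	content := []byte("[ZoneTransfer]\r\nZoneId=3\r\nReferrerUrl=https://example.com/\r\n")
	// Returns map[ReferrerUrl:https://example.com/ ZoneId:3]; TrimSpace
	// removes the trailing "\r" from each value.
	return parseZoneIdentifier(content)
}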
// --------------------
// Attribute Parsing
// --------------------

// parseAttributes walks the attribute area of a record and decodes the
// attribute types we know about. All reads go through safeSlice so a
// corrupt record can never index past the end of the buffer.
func parseAttributes(data []byte, rec *MFTRecord) {
	offset := int(rec.FirstAttrOffset)
	for offset < len(data)-8 {
		// First 4 bytes: attribute type.
		attrBytes, ok := safeSlice(data, offset, 4)
		if !ok {
			break
		}
		attrType := binary.LittleEndian.Uint32(attrBytes)
		if attrType == 0xFFFFFFFF { // end-of-attributes marker
			break
		}

		// Next 4 bytes: total attribute length.
		attrLenBytes, ok := safeSlice(data, offset+4, 4)
		if !ok {
			break
		}
		attrLen := binary.LittleEndian.Uint32(attrLenBytes)
		if attrLen < 8 || offset+int(attrLen) > len(data) {
			// Malformed attribute: skip one byte and try to re-sync.
			offset++
			continue
		}

		// Non-resident flag at offset+8 (0 = resident).
		residentFlag := data[offset+8]

		// For resident attributes, extract the value length and value offset.
		var valLen uint32
		var valOffset uint16
		if residentFlag == 0 {
			vb, ok := safeSlice(data, offset+16, 4)
			if !ok {
				offset += int(attrLen)
				continue
			}
			valLen = binary.LittleEndian.Uint32(vb)
			vo, ok := safeSlice(data, offset+20, 2)
			if !ok {
				offset += int(attrLen)
				continue
			}
			valOffset = binary.LittleEndian.Uint16(vo)
		}

		// Process known attribute types.
		switch attrType {
		case 0x10: // $STANDARD_INFORMATION
			if residentFlag == 0 {
				if siData, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok && len(siData) >= 32 {
					rec.SI_Crtime = filetimeToString(binary.LittleEndian.Uint64(siData[0:8]))
					rec.SI_Mtime = filetimeToString(binary.LittleEndian.Uint64(siData[8:16]))
					rec.SI_CtTime = filetimeToString(binary.LittleEndian.Uint64(siData[16:24]))
					rec.SI_Atime = filetimeToString(binary.LittleEndian.Uint64(siData[24:32]))
				}
			}

		case 0x30: // $FILE_NAME
			if residentFlag == 0 {
				if fnData, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok && len(fnData) >= 66 {
					var fn FileNameAttribute
					// The low 48 bits of the parent reference are the MFT record number.
					fn.ParentRef = binary.LittleEndian.Uint64(fnData[0:8]) & 0x0000FFFFFFFFFFFF
					fn.Crtime = filetimeToString(binary.LittleEndian.Uint64(fnData[8:16]))
					fn.Mtime = filetimeToString(binary.LittleEndian.Uint64(fnData[16:24]))
					fn.CtTime = filetimeToString(binary.LittleEndian.Uint64(fnData[24:32]))
					fn.Atime = filetimeToString(binary.LittleEndian.Uint64(fnData[32:40]))
					// Name length (in UTF-16 code units) at offset 64; the
					// name itself starts at offset 66.
					if filenameLenBytes, ok := safeSlice(fnData, 64, 1); ok {
						filenameLen := filenameLenBytes[0]
						if nameBytes, ok := safeSlice(fnData, 66, int(filenameLen)*2); ok {
							fn.Filename = decodeUTF16String(nameBytes)
						}
					}
					rec.FileNames = append(rec.FileNames, fn)
				}
			}

		case 0x80: // $DATA
			var ds DataStream
			// Extract the attribute name (the stream name), if any: the name
			// length lives at offset+9 and the 2-byte name offset at
			// offset+10; the name bytes start at offset+nameOff.
			if nameInfo, ok := safeSlice(data, offset+9, 1); ok {
				nameLen := nameInfo[0]
				if nameLen > 0 {
					if nameOffBytes, ok := safeSlice(data, offset+10, 2); ok {
						nameOff := binary.LittleEndian.Uint16(nameOffBytes)
						if nameBytes, ok := safeSlice(data, offset+int(nameOff), int(nameLen)*2); ok {
							ds.Name = decodeUTF16String(nameBytes)
						}
					}
				}
			}
			if residentFlag == 0 {
				ds.Resident = true
				if content, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok {
					ds.ContentBase64 = base64.StdEncoding.EncodeToString(content)
					// Fold parsed Zone.Identifier keys into the stream name.
					if ds.Name == "Zone.Identifier" {
						zoneInfo := parseZoneIdentifier(content)
						ds.Name = fmt.Sprintf("Zone.Identifier %v", zoneInfo)
					}
				}
			} else {
				ds.NonResident = true
			}
			rec.DataStreams = append(rec.DataStreams, ds)

		case 0x40: // $OBJECT_ID (when used as such)
			if residentFlag == 0 {
				if objData, ok := safeSlice(data, offset+int(valOffset), 16); ok {
					rec.ObjectID = fmt.Sprintf("%x", objData)
				}
			}

		case 0x50: // $SECURITY_DESCRIPTOR
			if residentFlag == 0 {
				// Minimal handling: store the raw descriptor as hex. Decoding
				// the owner/group SIDs and ACLs is left for later.
				if secData, ok := safeSlice(data, offset+int(valOffset), int(valLen)); ok && len(secData) >= 20 {
					rec.SecurityDescriptor = map[string]interface{}{
						"raw": fmt.Sprintf("%x", secData),
					}
				}
			}

			// (Other attribute types such as $ATTRIBUTE_LIST, $VOLUME_NAME,
			// etc. can be added here following the same pattern.)
		}

		offset += int(attrLen)
	}
}
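// splitFileReference illustrates the layout of the 8-byte file reference
// stored in $FILE_NAME: a 48-bit MFT record number in the low bytes and a
// 16-bit sequence number in the high bytes. parseAttributes above keeps
// only the record number; this minimal sketch (the helper name is ours)
// recovers both halves.
func splitFileReference(ref uint64) (recordNumber uint64, sequence uint16) {
	return ref & 0x0000FFFFFFFFFFFF, uint16(ref >> 48)
}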
// parseMFTRecord attempts to parse a 1024-byte MFT record. It returns an
// error if the buffer is too short or the expected "FILE" signature is
// missing.
func parseMFTRecord(data []byte) (*MFTRecord, error) {
	if len(data) < 46 {
		return nil, fmt.Errorf("data too short to be a valid record")
	}
	if string(data[:4]) != "FILE" {
		return nil, fmt.Errorf("invalid record header")
	}
	rec := &MFTRecord{
		UpdateSeqOffset: binary.LittleEndian.Uint16(data[4:6]),
		UpdateSeqSize:   binary.LittleEndian.Uint16(data[6:8]),
		LSN:             binary.LittleEndian.Uint64(data[8:16]),
		SequenceNumber:  binary.LittleEndian.Uint16(data[16:18]),
		HardLinkCount:   binary.LittleEndian.Uint16(data[18:20]),
		FirstAttrOffset: binary.LittleEndian.Uint16(data[20:22]),
		Flags:           binary.LittleEndian.Uint16(data[22:24]),
		RealSize:        binary.LittleEndian.Uint32(data[24:28]),
		AllocatedSize:   binary.LittleEndian.Uint32(data[28:32]),
		BaseFileRecord:  binary.LittleEndian.Uint64(data[32:40]),
		NextAttrId:      binary.LittleEndian.Uint16(data[40:42]),
		RecordNumber:    binary.LittleEndian.Uint32(data[42:46]),
	}
	parseAttributes(data, rec)
	return rec, nil
}
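// applyFixups is a minimal sketch of NTFS update-sequence ("fixup")
// handling, which this carver currently skips: on disk, the last two bytes
// of every 512-byte sector of a record are replaced with the update
// sequence number, and the original bytes are kept in the update sequence
// array that follows it. A stricter parser would call this in
// parseMFTRecord before parseAttributes; it also doubles as a validity
// check, since a mismatched USN indicates a torn record.
func applyFixups(data []byte, usaOffset, usaSize uint16) error {
	const sectorSize = 512
	// usaSize counts 2-byte entries: the USN itself plus one per sector.
	end := int(usaOffset) + int(usaSize)*2
	if usaSize < 2 || end > len(data) {
		return fmt.Errorf("update sequence array out of bounds")
	}
	usn := data[int(usaOffset) : int(usaOffset)+2]
	for i := 0; i < int(usaSize)-1; i++ {
		pos := (i+1)*sectorSize - 2
		if pos+2 > len(data) {
			return fmt.Errorf("record shorter than update sequence array implies")
		}
		if !bytes.Equal(data[pos:pos+2], usn) {
			return fmt.Errorf("fixup mismatch in sector %d", i)
		}
		// Restore the original two bytes from the array entry.
		copy(data[pos:pos+2], data[int(usaOffset)+2+i*2:int(usaOffset)+4+i*2])
	}
	return nil
}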
// --------------------
// Parallel Processing and Main
// --------------------

func processImageFile(inputFile string, wg *sync.WaitGroup) {
	defer wg.Done()

	f, err := os.Open(inputFile)
	if err != nil {
		log.Printf("Failed to open %s: %v", inputFile, err)
		return
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		log.Printf("Failed to stat %s: %v", inputFile, err)
		return
	}
	fileSize := fi.Size()

	// Create an output folder and JSONL file named after the input file and
	// the current timestamp.
	timestamp := time.Now().Format("20060102150405")
	baseName := filepath.Base(inputFile)
	outDir := fmt.Sprintf("%s_%s", baseName, timestamp)
	if err := os.Mkdir(outDir, 0755); err != nil {
		log.Printf("Failed to create output directory for %s: %v", inputFile, err)
		return
	}
	jsonlFileName := fmt.Sprintf("%s_%s.jsonl", baseName, timestamp)
	jsonlFile, err := os.Create(jsonlFileName)
	if err != nil {
		log.Printf("Failed to create JSONL file for %s: %v", inputFile, err)
		return
	}
	defer jsonlFile.Close()

	carvedChan := make(chan CarvedRecord, 100)
	parsedChan := make(chan ParsedRecord, 100)

	// Worker pool that writes raw records to disk and parses them.
	numWorkers := runtime.NumCPU()
	var workerWg sync.WaitGroup
	for i := 0; i < numWorkers; i++ {
		workerWg.Add(1)
		go func() {
			defer workerWg.Done()
			for carved := range carvedChan {
				// Write the raw record to disk.
				recordFileName := filepath.Join(outDir, fmt.Sprintf("%d.mftrecord", carved.Offset))
				if err := os.WriteFile(recordFileName, carved.RecordBytes, 0644); err != nil {
					log.Printf("Failed to write record at offset %d in %s: %v", carved.Offset, inputFile, err)
					continue
				}
				// Parse the record; skip candidates that fail validation.
				mft, err := parseMFTRecord(carved.RecordBytes)
				if err != nil {
					continue
				}
				// Build the JSON record (omitting the magic field).
				recordMap := map[string]interface{}{
					"input_image":         inputFile,
					"offset":              carved.Offset,
					"update_seq_offset":   mft.UpdateSeqOffset,
					"update_seq_size":     mft.UpdateSeqSize,
					"lsn":                 mft.LSN,
					"sequence_number":     mft.SequenceNumber,
					"hard_link_count":     mft.HardLinkCount,
					"first_attr_offset":   mft.FirstAttrOffset,
					"flags":               mft.Flags,
					"real_size":           mft.RealSize,
					"allocated_size":      mft.AllocatedSize,
					"base_file_record":    mft.BaseFileRecord,
					"next_attr_id":        mft.NextAttrId,
					"record_number":       mft.RecordNumber,
					"si_crtime":           mft.SI_Crtime,
					"si_mtime":            mft.SI_Mtime,
					"si_ctime":            mft.SI_CtTime,
					"si_atime":            mft.SI_Atime,
					"object_id":           mft.ObjectID,
					"security_descriptor": mft.SecurityDescriptor,
					"file_names":          mft.FileNames,
					"data_streams":        mft.DataStreams,
				}
				parsedChan <- ParsedRecord{Offset: carved.Offset, RecordJSON: recordMap}
			}
		}()
	}

	// Writer goroutine that serializes parsed records as JSONL.
	var writerWg sync.WaitGroup
	writerWg.Add(1)
	go func() {
		defer writerWg.Done()
		encoder := json.NewEncoder(jsonlFile)
		for pr := range parsedChan {
			if err := encoder.Encode(pr.RecordJSON); err != nil {
				log.Printf("Error writing JSON record at offset %d in %s: %v", pr.Offset, inputFile, err)
			}
		}
	}()

	// Scan the image for the "FILE0" signature: the "FILE" magic followed by
	// 0x30 ('0'), the usual low byte of the update sequence offset field.
	reader := bufio.NewReader(f)
	const recordSize = 1024
	const chunkSize = 1024 * 1024
	pattern := []byte("FILE0")
	var fileOffset int64
	var leftover []byte
	lastPrint := time.Now()

	for {
		chunk := make([]byte, chunkSize)
		n, err := reader.Read(chunk)
		if n == 0 {
			break
		}
		data := append(leftover, chunk[:n]...)

		// Progress update every ~5 seconds.
		if time.Since(lastPrint) > 5*time.Second {
			perc := float64(fileOffset) / float64(fileSize) * 100.0
			log.Printf("Processing %s: %.2f%% complete", inputFile, perc)
			lastPrint = time.Now()
		}

		searchLimit := len(data) - len(pattern)
		for i := 0; i <= searchLimit; i++ {
			if bytes.Equal(data[i:i+len(pattern)], pattern) {
				globalOffset := fileOffset - int64(len(leftover)) + int64(i)
				if globalOffset+recordSize > fileSize {
					continue
				}
				// ReadAt does not disturb the bufio reader's position.
				recordBytes := make([]byte, recordSize)
				if _, err := f.ReadAt(recordBytes, globalOffset); err != nil {
					log.Printf("Failed to read record at offset %d in %s: %v", globalOffset, inputFile, err)
					continue
				}
				carvedChan <- CarvedRecord{Offset: globalOffset, RecordBytes: recordBytes}
			}
		}

		// Keep the last len(pattern)-1 bytes so a signature straddling two
		// chunks is still found; copy them so the next append cannot alias
		// the buffer we just searched.
		if len(data) >= len(pattern)-1 {
			leftover = append([]byte(nil), data[len(data)-(len(pattern)-1):]...)
		} else {
			leftover = append([]byte(nil), data...)
		}
		fileOffset += int64(n)
		if err == io.EOF {
			break
		}
	}

	close(carvedChan)
	workerWg.Wait()
	close(parsedChan)
	writerWg.Wait()
	log.Printf("Finished processing %s. Raw records are in %s and the JSONL file is %s",
		inputFile, outDir, jsonlFileName)
}
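// The raw "flags" value serialized above uses the documented NTFS record
// header bits: 0x0001 marks the record as in use and 0x0002 marks it as a
// directory. A minimal decoding sketch follows (the constant and helper
// names are ours, the bit values are the documented ones).
const (
	mftFlagInUse     uint16 = 0x0001
	mftFlagDirectory uint16 = 0x0002
)

func describeRecordFlags(flags uint16) string {
	switch {
	case flags&mftFlagInUse == 0:
		// Carved records in this state are often the interesting ones.
		return "deleted"
	case flags&mftFlagDirectory != 0:
		return "directory"
	default:
		return "file"
	}
}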
func main() {
	flag.Parse()
	if flag.NArg() == 0 {
		fmt.Printf("Usage: %s <image-file-or-directory> [...]\n", os.Args[0])
		os.Exit(1)
	}

	// Build the list of input files from the arguments, descending into
	// directories recursively.
	var files []string
	for _, arg := range flag.Args() {
		fi, err := os.Stat(arg)
		if err != nil {
			log.Printf("Error stating %s: %v", arg, err)
			continue
		}
		if fi.IsDir() {
			err := filepath.Walk(arg, func(path string, info os.FileInfo, err error) error {
				if err != nil {
					return nil // skip unreadable entries
				}
				if !info.IsDir() {
					files = append(files, path)
				}
				return nil
			})
			if err != nil {
				log.Printf("Error walking directory %s: %v", arg, err)
			}
		} else {
			files = append(files, arg)
		}
	}

	totalFiles := len(files)
	if totalFiles == 0 {
		log.Println("No input files found.")
		return
	}
	log.Printf("Found %d files to process.", totalFiles)

	// Process up to NumCPU files concurrently; the semaphore bounds the
	// number of in-flight goroutines.
	var wg sync.WaitGroup
	sem := make(chan struct{}, runtime.NumCPU())
	for i, file := range files {
		wg.Add(1)
		sem <- struct{}{}
		go func(i int, file string) {
			defer func() { <-sem }()
			log.Printf("Starting file %d/%d: %s", i+1, totalFiles, file)
			processImageFile(file, &wg)
		}(i, file)
	}
	wg.Wait()
	log.Println("All files processed.")
}
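// Typical invocation (hypothetical binary name and paths):
//
//	go build -o mftcarver .
//	./mftcarver disk.img more_images/
//
// Each input file yields a <name>_<timestamp>/ directory of raw
// *.mftrecord files plus a <name>_<timestamp>.jsonl file with one parsed
// record per line.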