深度学习之危险脚本识别

总结:没什么用!即使收集几千个 webshell、一句话木马及其各种变种脚本,看似很多,但对深度学习来说,几千甚至几万个样本只是沧海一粟,跑完若干个 epoch 之后得到的基本都是过拟合的结果。

不过,作为学习练手的项目还是可以的!

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

class VulnerabilityDataset(Dataset):
    """Map-style dataset of labelled code snippets with a keyword focus mask.

    Every sample is tokenized on access and returned together with a
    per-token ``focus_mask`` that is 1.0 where the token text (with the
    ``'Ġ'`` marker removed) equals one of ``sensitive_keywords`` and 0.0
    everywhere else.
    """

    def __init__(self, data_list, tokenizer, max_length: int = 512):
        """Store the samples and the tokenization settings.

        Args:
            data_list: Sized, indexable collection whose items are read
                through ``item['code']`` and ``item['label']``.
            tokenizer: Callable tokenizer that returns ``'input_ids'`` and
                ``'attention_mask'`` tensors and also provides
                ``convert_ids_to_tokens`` (presumably a Hugging Face
                tokenizer -- confirm against the caller).
            max_length: Length each sample is padded or truncated to.
        """
        self.data = data_list
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Token texts that receive a focus value of 1.0.
        self.sensitive_keywords = {"exec", "system", "eval", "os", "subprocess"}

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx) -> dict:
        """Tokenize sample ``idx`` and build its keyword focus mask.

        Returns:
            A dict with ``'input_ids'``, ``'attention_mask'``, ``'labels'``
            (scalar ``torch.long`` tensor) and ``'focus_mask'`` (float
            tensor shaped like ``'input_ids'``).
        """
        item = self.data[idx]
        code = str(item['code'])
        label = int(item['label'])

        encoding = self.tokenizer(
            code,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch dimension. A bare
        # squeeze() would also collapse the sequence dimension when
        # max_length == 1, leaving a 0-d tensor that cannot be iterated.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Hand the tokenizer a plain list of ints rather than a tensor.
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids.tolist())

        focus_mask = torch.zeros_like(input_ids, dtype=torch.float)
        for position, token in enumerate(tokens):
            # 'Ġ' is presumably the tokenizer's leading-space marker; it is
            # stripped so that " exec" and "exec" compare equal.
            if token.replace('Ġ', '') in self.sensitive_keywords:
                focus_mask[position] = 1.0

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long),
            'focus_mask': focus_mask
        }

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Tiny in-memory corpus of (code, label) pairs. The fourth sample was added
# so that the data fits batch_size=2.
raw_data = [
    {"code": snippet, "label": flag}
    for snippet, flag in (
        ("def safe_func(): print('hello')", 0),
        ("def vuln_func(): exec(user_input)", 1),
        ("x = 1 + 1", 0),
        ("import os; os.system('rm -rf /')", 1),
    )
]

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "microsoft/graphcodebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the corpus and serve it in shuffled batches of two samples.
dataset = VulnerabilityDataset(data_list=raw_data, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)

# Mean-squared-error criterion used for the attention-guidance term.
mse_loss_fn = torch.nn.MSELoss()

# Two-class sequence classifier that also returns its attention weights.
# NOTE(review): "eager" attention is presumably selected so that the
# attention tensors are actually materialised -- confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    attn_implementation="eager",
    use_safetensors=True,
    output_attentions=True,
    num_labels=2,
)

# Move the weights to the chosen device and switch to training mode before
# the optimizer is built over the parameters.
model.to(device)
model.train()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

print("\n--- 开始训练 ---")

# Weight of the attention-guidance term relative to the classification loss.
# It does not change between batches, so it is set once before the loop.
lambda_factor = 10.0

for epoch in range(3):
    # Sum of the combined per-batch losses, reported once the epoch ends.
    total_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Per-token 0/1 mask marking the sensitive keyword positions.
        target_focus = batch['focus_mask'].to(device)

        # Passing labels makes the model return its classification loss.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        cls_loss = outputs.loss

        # Attention of the last layer; presumably shaped
        # (batch, heads, query, key) -- TODO confirm.
        last_layer_attn = outputs.attentions[-1]
        # Keep only the rows for query position 0 (the first token).
        cls_attention = last_layer_attn[:, :, 0, :]
        # Average over dim 1 (presumably the attention heads).
        avg_cls_attention = torch.mean(cls_attention, dim=1)

        # NOTE(review): attention rows are presumably softmax outputs that
        # sum to 1, while target_focus is an unnormalised 0/1 mask (all
        # zeros for samples without keywords) -- confirm that plain MSE
        # against this mask is the intended objective.
        attn_loss = mse_loss_fn(avg_cls_attention, target_focus)

        loss = cls_loss + (lambda_factor * attn_loss)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} | Total Loss: {loss.item():.4f} (Cls: {cls_loss.item():.4f}, Attn: {attn_loss.item():.4f})")

    # Report the accumulated loss, which was previously computed but never
    # shown. max(..., 1) guards against an empty dataloader.
    print(f"Epoch {epoch+1} | Mean Loss: {total_loss / max(len(dataloader), 1):.4f}")