Study Notes (pirate)

Overview:
- Use Python's `sklearn` for data preprocessing, including grid-search tuning of an AdaBoost regressor, handling of time-series data, and data visualization.
- Use the `transformers` library to fine-tune a pretrained language model for text classification on the RTE, MRPC, and SST-2 tasks, with PEFT (Parameter-Efficient Fine-Tuning) / LoRA adapters.
- Run an image segmentation task: image preprocessing, defining a dataset, and training a DeepLabV3 ResNet50 model.

Getting started

pip install jupyterlab
Start the JupyterLab server:
jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root


pip install transformers datasets
pip install opencv-python
pip install jupyter
pip install torch torchvision torchmetrics
pip install scikit-learn peft ipykernel
pip install seaborn
pip install imbalanced-learn
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

PD_SK

from sklearn import ensemble,preprocessing,model_selection,metrics
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import random
ord('A')
chr(65)
plt.pie(s['digit'],labels=s.index,autopct='%4.2f%%',startangle=90,explode=[0,0,0,0.2],shadow=True,colors=['orange','lightgreen','lightblue','pink'])
r = plt.bar(s.index,s['digit'],color='pink',label='pnn')
plt.bar(s.index,s['digit'],bottom=s['digit'],color='lightgreen',label='grn')
plt.legend()
GDB = ensemble.AdaBoostRegressor()
params = {'n_estimators':range(38,42,2),
          'learning_rate':np.linspace(0.2,1.8,4)
          }
gs = model_selection.GridSearchCV(GDB,params,cv=3)

data = pd.read_csv('Alice_Springs_2019.csv',)
data = data[~data.isna().any(axis=1)]
data.reset_index(inplace=True,drop=True)
data['timestamp']= pd.to_datetime(data['timestamp'],errors='coerce')
data['timestamp'][0].strftime('%Y-%m-%d')
data['timestamp'][0].day
data['Year'] = data['timestamp'].apply(lambda x:x.year)
data['Month'] = data['timestamp'].apply(lambda x:x.month)
data['Day'] = data['timestamp'].apply(lambda x:x.day)
data['Hour'] = data['timestamp'].apply(lambda x:x.hour)
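The same columns can be built in one shot with pandas' .dt accessor instead of row-wise apply (a small equivalent sketch):

data['Year'] = data['timestamp'].dt.year
data['Month'] = data['timestamp'].dt.month
data['Day'] = data['timestamp'].dt.day
data['Hour'] = data['timestamp'].dt.hour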

pd_data = data.drop(columns=['timestamp'])
ss_feature = preprocessing.StandardScaler()
ss_ap = preprocessing.StandardScaler()
pd_data.iloc[:,1:7] = ss_feature.fit_transform(pd_data.iloc[:,1:7]) 
pd_data['Active_Power']= ss_ap.fit_transform(pd.DataFrame(pd_data['Active_Power']))

X = pd_data.iloc[:,1:]
Y = pd_data['Active_Power']
gs.fit(X,Y)

import joblib
import pickle

joblib.dump(gs.best_estimator_,'./adaboost.est')
model = joblib.load('./adaboost.est')
metric = metrics.r2_score
pred = model.predict(X)
metric(Y,pred)
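R² is scale-invariant, so it reads the same on the standardized target, but errors are easier to interpret in the original power units. A sketch that uses the ss_ap scaler fitted above to undo the standardization:

pred_orig = ss_ap.inverse_transform(pred.reshape(-1,1)).ravel()
y_orig = ss_ap.inverse_transform(pd.DataFrame(Y)).ravel()
metrics.mean_absolute_error(y_orig,pred_orig)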
tx,ex,ty,ey = model_selection.train_test_split(X,Y,test_size=0.2)
from imblearn.over_sampling import RandomOverSampler

metric(ey,gs.predict(ex))
gs.best_score_

tx,ex,ty,ey = model_selection.train_test_split(X,Y,test_size=0.2)
gs.fit(tx,ty)
gs.best_score_
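After fitting, the winning hyperparameter combination and the per-candidate CV scores can be inspected (standard GridSearchCV attributes):

gs.best_params_                      # which n_estimators / learning_rate combination won
gs.cv_results_['mean_test_score']    # mean CV score for every candidate in the grid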

pd_data.groupby(['Year','Month']).agg({'Active_Power':lambda x:str(x.idxmax())+' '+str( x.max())})

btm,bins,_ = plt.hist(pd_data['Active_Power'],color='lightblue',label='standardized',density=True)
plt.hist(data['Active_Power'],bottom=btm,bins=bins,color='pink',label='origin',density=True)
plt.legend()
plt.xlabel('power')
plt.ylabel('num')
plt.title('Hist')
plt.savefig('hist.jpg',dpi=300,format='jpg')

import seaborn as sns
cormatrix = data.iloc[:,1:].corr()
cormatrix['Active_Power'][1:]
sns.heatmap(cormatrix,cmap='twilight_r')

sns.kdeplot(data['Active_Power'])
plt.hist(data['Active_Power'],color='pink',label='origin',density=True)

sns.scatterplot(x=data['Global_Horizontal_Radiation'],y=data['Active_Power'])

plt.cm.PuRd
plt.scatter(x=data['Global_Horizontal_Radiation'],y=data['Active_Power'],c=data['Active_Power'],linewidths=0.1,edgecolors='black',alpha=0.8,cmap='tab20_r')
plt.colorbar()

sns.regplot(x=data['Global_Horizontal_Radiation'],y=data['Active_Power'],line_kws={'color':'red'})

plt.figure(figsize=(24,5))
plt.plot_date(x=data['timestamp'][:5000],y=data['Active_Power'][:5000],fmt='-o',color='red')


plt.figure(figsize=(24,5))
plt.plot(data['Active_Power'][:5000])

rst = plt.boxplot(data['Active_Power'])
plt.show()


diff = data['Active_Power'].quantile(0.75)-data['Active_Power'].quantile(0.25)
up_bound  = data['Active_Power'].median()+diff 
low_bound  = data['Active_Power'].median()-diff 
data[(data['Active_Power']<low_bound) | (data['Active_Power']>up_bound)]
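The bounds above are the median plus/minus one IQR. For comparison, the conventional boxplot (Tukey) rule places the fences 1.5 IQR beyond the quartiles, which usually flags fewer points (a sketch):

q1 = data['Active_Power'].quantile(0.25)
q3 = data['Active_Power'].quantile(0.75)
iqr = q3 - q1
data[(data['Active_Power'] < q1 - 1.5*iqr) | (data['Active_Power'] > q3 + 1.5*iqr)]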

LLM

RTE

import torch
from transformers import AutoModelForSequenceClassification,AutoTokenizer,Trainer,TrainingArguments
from datasets import Dataset
from peft import LoraConfig,get_peft_model,prepare_model_for_kbit_training,PeftModel,TaskType
import pandas as pd
import os

llm_model_path = "C:/gemma-2b-it"
label_list = ['entailment', 'not_entailment']

tknizer = AutoTokenizer.from_pretrained(llm_model_path)
def data_tk_fn(item):
    return tknizer(item['sentence'],return_tensors='pt',padding='max_length',max_length=100,truncation=True)
def merge_sentence(item):
    s1= ' '.join(item['sentence1'].split()[:80])
    s2 = ' '.join(item['sentence2'].split()[:20])
    sentence = f'sentence1:{s1}\nsentence2:{s2}'
    return sentence

def load_data(data_path):
    df_data = pd.read_csv(data_path,sep='\t',index_col='index')
    df_data.dropna(axis=0,ignore_index=True,inplace=True)
    df_data.label = df_data['label'].apply(lambda x: label_list.index(x))
    df_data = df_data.sample(frac=1,ignore_index=True)
    df_data.rename(columns={'label':'labels'},inplace=True)
    df_data['sentence'] = df_data.apply(merge_sentence,axis=1)
    ds = Dataset.from_pandas(df_data)

    return ds.map(function=data_tk_fn,batched=True).remove_columns(['sentence1', 'sentence2','sentence'])

train_path = 'train.tsv'
data_path = train_path
df_data = pd.read_csv(data_path,sep='\t',index_col='index')
df_data.dropna(axis=0,ignore_index=True,inplace=True)
df_data.label = df_data['label'].apply(lambda x: label_list.index(x))
df_data = df_data.sample(frac=1,ignore_index=True)
df_data.rename(columns={'label':'labels'},inplace=True)
df_data['sentence'] = df_data.apply(merge_sentence,axis=1)
ds = Dataset.from_pandas(df_data)
ds.map(function=data_tk_fn,batched=True).remove_columns(['sentence1', 'sentence2','sentence'])
df_data['s1_words'] = df_data['sentence1'].apply(lambda x:len(x.split()))
df_data['s2_words'] = df_data['sentence2'].apply(lambda x:len(x.split()))

df_data['s1_words'].describe()
df_data['s1_words'].hist()

df_data['s2_words'].describe()
df_data['s2_words'].hist()
train_ds = load_data(train_path)
dev_ds = load_data('dev.tsv')

llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path,num_labels =2 )
lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,
                         r =3,
                         inference_mode = False,
                         target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'])
llm_model = prepare_model_for_kbit_training(llm_model)
peft_model = get_peft_model(llm_model,lora_config)
peft_model.print_trainable_parameters()
train_arg = TrainingArguments(output_dir='./output/',
                              eval_strategy='steps',
                              logging_first_step=True,
                              logging_steps=10,
                              log_level='info',
                              logging_strategy='steps',
                              save_strategy='epoch',
                              per_device_train_batch_size=2)
train_arg.batch_eval_metrics

from torchmetrics.classification import Accuracy
from transformers import trainer_utils
import numpy as np
from transformers.data import data_collator

acc_metric = Accuracy(task='multiclass',num_classes=2,top_k = 1)
def metric(eval_preds):
    predictions = eval_preds.predictions
    label_ids = eval_preds.label_ids

    pred = predictions.argmax(axis=-1)

    acc = acc_metric(torch.tensor(pred,dtype=torch.long),torch.tensor(label_ids,dtype=torch.long))
    return {'Accuracy':acc}

trainer = Trainer(peft_model,train_arg,train_dataset=train_ds,eval_dataset=dev_ds,compute_metrics=metric)
trainer.train()
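After training, the dev-set metric can be re-run and the adapter saved on its own; only the LoRA weights are written, so the checkpoint stays small (a sketch; the output path is an assumption):

trainer.evaluate()                                # runs the Accuracy metric() defined above on dev_ds
peft_model.save_pretrained('./rte_lora_adapter')  # hypothetical path; stores only the LoRA adapter weights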

MRPC

# imports
import pandas as pd
import numpy as np
import torch
from transformers import TrainingArguments,AutoModelForSequenceClassification,AutoTokenizer,Trainer
from peft import LoraModel,LoraConfig,prepare_model_for_kbit_training,TaskType,get_peft_model,PeftModel
from datasets import Dataset
from transformers import trainer_utils

def forge_text(data_row):
    string1 = data_row['#1 String']
    string2 = data_row['#2 String']
    forged_string = \
        f'String_1:{string1}\nString_2:{string2}'

    return forged_string

# gemma_tokenizer = AutoTokenizer.from_pretrained('D:/gemma-transformers-2b-it-v3')
gemma_tokenizer = AutoTokenizer.from_pretrained('C:/gemma-2b-it')
def ds_tokenize(item):
    return gemma_tokenizer(item['text'],padding='max_length',max_length=96,truncation=True,return_tensors='pt').to('cuda')
def load_data(tsv_path):

    data = pd.read_csv(tsv_path,sep='\t',on_bad_lines='skip').sample(frac=0.5)
    data['text'] = data.apply(forge_text,axis=1)
    if 'Quality' in data.columns:
        data_dict = {'text':data['text'],'labels':data['Quality']}

    else:
        data_dict = {'text':data['text']}
    ds = Dataset.from_dict(data_dict)
    return ds.map(ds_tokenize,batched=True).remove_columns('text')
train_ds = load_data('./train.tsv')
dev_ds = load_data('./dev.tsv')
test_ds = load_data('./test.tsv')
gemma_tokenizer('hello world',return_tensors='pt')

# gemma_model = AutoModelForSequenceClassification.from_pretrained('D:/gemma-transformers-2b-it-v3',num_labels=2,device_map='cuda')
gemma_model = AutoModelForSequenceClassification.from_pretrained('D:/gemma-transformers-2b-it-v3',num_labels=2)
gemma_model
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
    inference_mode=False,
    r = 4
    )

model = prepare_model_for_kbit_training(gemma_model)
model = get_peft_model(model,config)
model.print_trainable_parameters()
train_args = TrainingArguments(
    output_dir='./output/',
    auto_find_batch_size=True,
    learning_rate=1e-4,
    num_train_epochs=5,
    logging_dir='./log/'
)
trainer = Trainer(
    model,
    train_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    compute_metrics=metric  # reuses the accuracy metric() defined in the RTE section
    )
trainer.train()
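The unlabeled test split can then be scored with Trainer.predict; taking the argmax of the two logits gives the predicted Quality label (a sketch):

test_out = trainer.predict(test_ds)          # test_ds has no labels, so only logits come back
test_pred = test_out.predictions.argmax(-1)  # 0/1 prediction for each sentence pair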

SST-2

import pandas as pd
import torch
import peft
import transformers
import torchmetrics
from tqdm import tqdm

file_path = './train.tsv'
pd_data = pd.read_csv(file_path,sep='\t')
pd_data.rename(columns={'label':'labels'},inplace=True)
pd_data.columns

from transformers import AutoModelForSequenceClassification,AutoTokenizer,Trainer,TrainingArguments
from datasets import Dataset

llm = AutoModelForSequenceClassification.from_pretrained('C:/gemma-2b-it/',num_labels=2)
tknizer = AutoTokenizer.from_pretrained('C:/gemma-2b-it/')
llm

def ds_map_fn(item):
    return tknizer(item['sentence'],truncation=True,padding='longest',return_tensors='pt')


file_path = './train.tsv'
def get_dataset(file_path,sample=1.0):
    pd_data = pd.read_csv(file_path,sep='\t')
    pd_data.rename(columns={'label':'labels'},inplace=True)
    pd_data = pd_data.sample(frac=sample)
    ds = Dataset.from_pandas(pd_data)
    # ds = ds.map(ds_map_fn,batched=True).remove_columns(['sentence'])
    return ds
ds = get_dataset(file_path)

# input_ids = torch.tensor(ds[:5]['input_ids'],dtype=torch.int)
# attention_mask = torch.tensor(ds[:5]['attention_mask'],dtype=torch.int)
# labels = torch.tensor(ds[:5]['labels'],dtype=torch.long)

from torch.utils.data import DataLoader

# dl = DataLoader(ds,batch_size=5)
# for item in dl:

#     labels = torch.tensor(item['labels'],dtype=torch.long)
#     attention_mask = torch.stack(item['attention_mask'],dim=1)
#     input_ids= torch.stack(item['input_ids'],dim=1)
#     print(tknizer.decode(input_ids[0]))
#     break
# rst = llm.forward(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
# rst.loss

from peft import LoraConfig,prepare_model_for_kbit_training,get_peft_model
lora_config = LoraConfig(task_type=peft.utils.peft_types.TaskType.SEQ_CLS,
                        r=4,
                        target_modules=['k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'])
peft_model = prepare_model_for_kbit_training(llm)
peft_model = get_peft_model(peft_model,lora_config)
peft_model.print_trainable_parameters()

# train_arg = TrainingArguments(output_dir='./sst/')
# trainer = Trainer(peft_model,args=train_arg,train_dataset=ds)
# trainer.train()

Epoch = 3
Batchsize = 10
lr = 5e-4
opt = torch.optim.AdamW(params=peft_model.parameters(),lr=lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=opt,step_size=3,gamma=0.8)
metric = torchmetrics.Accuracy('multiclass',num_classes=2)


train_ds = get_dataset('./train.tsv')
eval_ds = get_dataset('./dev.tsv')
train_dl = DataLoader(dataset=train_ds,batch_size=Batchsize)
eval_dl = DataLoader(dataset=eval_ds,batch_size=Batchsize)

def eval(metric,epoch,peft_model,eval_dl):
    with torch.no_grad():
        total_loss = 0
        peft_model.eval()
        t_bar = tqdm(eval_dl,position=0)
        t_bar.set_description(f'Eval Epoch:{epoch}')
        batch = 0
        metric.reset()
        metric.to('cuda')
        for item in t_bar:
            labels = item['labels'].long().to('cuda')
            sentence = item['sentence']
            tk_rst = tknizer(sentence,padding=True,truncation=True,return_tensors='pt').to('cuda')
            tk_rst['labels'] = labels
            rst = peft_model.forward(**tk_rst)

            loss = rst.loss

            total_loss = total_loss+float(loss)
            epoch_avg_loss = total_loss/(batch+1)

            logits = rst.logits
            pred = logits.argmax(-1)


            batch_acc = metric(pred,labels)
            avg_acc = metric.compute()

            t_bar.set_postfix({'Loss':float(loss),'AVG_LOSS':epoch_avg_loss,'acc':float(batch_acc),'AVG_acc':float(avg_acc)})
            batch = batch +1
        peft_model.train()
        return epoch_avg_loss

# train
def train(peft_model,Epoch,train_dl,eval_dl,opt,lr_scheduler,metric):
    peft_model.cuda()
    best_eval_loss = 1000
    for epoch in range(Epoch):
        total_loss = 0
        peft_model.train()
        t_bar = tqdm(train_dl,position=0)
        t_bar.set_description(f'Train Epoch:{epoch:4d}')
        batch = 0
        for item in t_bar:
            labels = item['labels'].long().to('cuda')
            sentence = item['sentence']
            tk_rst = tknizer(sentence,padding=True,truncation=True,return_tensors='pt').to('cuda')
            tk_rst['labels'] = labels
            # attention_mask = torch.stack(item['attention_mask'],dim=1).to('cuda')
            # input_ids= torch.stack(item['input_ids'],dim=1).to('cuda')
            # rst = peft_model.forward(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
            rst = peft_model.forward(**tk_rst)
            loss = rst.loss

            loss.backward()
            opt.step()
            opt.zero_grad()

            total_loss = total_loss+float(loss)
            epoch_avg_loss = total_loss/(batch+1)
            t_bar.set_postfix({'BatchLoss':float(loss),'Epoch_AVG_LOSS':epoch_avg_loss})

            if batch%500 == 0:
                eval_loss = eval(metric,epoch,peft_model,eval_dl)
                lr_scheduler.step()
                if eval_loss < best_eval_loss:
                    peft_model.save_pretrained(f'./sst_gemma_best_{epoch}_{batch}')
                    best_eval_loss = eval_loss
            batch = batch +1



train(peft_model,Epoch,train_dl,eval_dl,opt,lr_scheduler,metric)
peft_model.save_pretrained('./sst_gemma')

from peft import PeftModel
loaded_model = PeftModel.from_pretrained(llm,'./sst_gemma',is_trainable=True)
loaded_model.print_trainable_parameters()

test_d = pd.read_csv('test.tsv',sep='\t',index_col='index')
sentences = test_d['sentence'].tolist()

tk_rst = tknizer(sentences,truncation=True,padding=True,return_tensors='pt')

tk_rst.keys()
tk_rst['input_ids'],tk_rst['attention_mask']
loaded_model.forward(input_ids=tk_rst['input_ids'])
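A hedged sketch for turning the test logits into label predictions and writing them out; the no_grad context, moving tensors to the model's device, and the output filename are assumptions, and batching is omitted for brevity:

with torch.no_grad():
    logits = loaded_model(input_ids=tk_rst['input_ids'].to(loaded_model.device),
                          attention_mask=tk_rst['attention_mask'].to(loaded_model.device)).logits
preds = logits.argmax(-1).cpu().numpy()
pd.DataFrame({'index':test_d.index,'prediction':preds}).to_csv('sst2_preds.tsv',sep='\t',index=False)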

IMG_SEG

import cv2
import os
import numpy as np
import pandas as pd
import json
from matplotlib import pyplot as plt
import torchvision as v
import torch
import random

img_dir = './JPEGImages'
anno_dir = './labelme_anno'
OP_H = 540
OP_W = 960
anno_filelist = os.listdir(anno_dir)
anno_file = anno_filelist[0]
print(anno_file)
Mode = 'Train'


def get_img_anno(anno_file,OP_H,OP_W,Mode='Train'):

    with open(os.path.join(anno_dir,anno_file)) as f:
        anno_info = json.load(f)

    img_h = anno_info['imageHeight']
    img_w = anno_info['imageWidth']
    img_path = anno_info['imagePath']
    img_path = img_path[1:]
    img_shapes = anno_info['shapes']
    regions = []


    img = cv2.imread(img_path)#h,w,c
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)

    randomcrop = v.transforms.RandomCrop((OP_H,OP_W))
    resize = v.transforms.Resize((OP_H,OP_W),interpolation=v.transforms.InterpolationMode.NEAREST)
    to_tensor = v.transforms.ToTensor()
    to_PilImg = v.transforms.ToPILImage()
    color_jitter= v.transforms.ColorJitter(0.2,0.2,0.2)
    img_tensor = to_tensor(img)

    #20% chance of not cropping: resize the whole frame instead (always taken outside Train mode)
    if random.random()<0.2 or Mode != 'Train':
        img_tensor = resize(img_tensor)

        if Mode == 'Train':
            img_tensor = color_jitter(img_tensor)

        for shape in img_shapes:
            points = shape['points']
            for i in range(len(points)):
                points[i][0] = points[i][0]/img_w*OP_W
                points[i][1] = points[i][1]/img_h*OP_H
            regions.append(np.array(points,np.int32))   # append every shape, not just the last one
        label_img = cv2.fillPoly(np.zeros((OP_H,OP_W)),regions,color=1)
        label_img = to_tensor(label_img)

    #crop branch
    else:
        img_tensor = color_jitter(img_tensor)
        for shape in img_shapes:
            points = shape['points']
            regions.append(np.array(points,np.int32))
        label_img = cv2.fillPoly(np.zeros((img_h,img_w)),regions,color=1)
        label_img = to_tensor(label_img)
        img_all = torch.concat((img_tensor,label_img),dim=0)

        cut_ok = 0
        cut_times = 0
        while(cut_ok == 0 ):
            img_all_cut = randomcrop(img_all)
            cut_img = img_all_cut[0:-1,:,:]
            cut_label = img_all_cut[-1,:,:]
            cut_times = cut_times+1
            if cut_label.sum()/label_img.sum() > 0.6 or cut_times>500:
                cut_ok = 1 
                label_img=cut_label
                img_tensor = cut_img
    return img_tensor,label_img.squeeze()


img,label = get_img_anno(anno_file,OP_H,OP_W)
topil = v.transforms.ToPILImage()

label == 1
img*label+(label == 0)  # white background
s = img.clone()
rgb_mask = torch.zeros((3,540,960))
rgb_mask[0] = label
rgb_mask[1]  = label
merged = s+rgb_mask*0.2
merged = merged.clip(0,1)
topil(merged)
topil(label)

anno_filelist = os.listdir(anno_dir)
random.shuffle(anno_filelist)
train_anno = anno_filelist[0:100]
eval_anno = anno_filelist[100:]

import torch.utils
import torch.utils.data


class IMG_SEG_DS(torch.utils.data.Dataset):
    def __init__(self,anno_filelist,Mode= 'Train',OP_H=540,OP_W=960) -> None:
        super().__init__()


        self.anno_filelist = anno_filelist

        self.OP_H = OP_H
        self.OP_W = OP_W
        self.Mode = Mode

    def __getitem__(self, idx):
        anno_file = self.anno_filelist[idx]
        img,label = get_img_anno(anno_file,self.OP_H,self.OP_W,self.Mode)
        return img,label
    def __len__(self):
        return len(self.anno_filelist)

train_ds = IMG_SEG_DS(train_anno)
eval_ds = IMG_SEG_DS(eval_anno,Mode='eval')

class IMG_SEG(torch.nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
    def forward(self,x):
        return x

model = v.models.segmentation.deeplabv3_resnet50(pretrained = False,pretrained_backbone=False,num_classes=2).cuda()

# hyperparameters
Epoch = 45
Batchsize = 2
lr = 5e-4
opt = torch.optim.AdamW(params=model.parameters(),lr=lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=opt,step_size=3,gamma=0.6)
loss_fn = torch.nn.CrossEntropyLoss()


train_dl = torch.utils.data.DataLoader(train_ds,Batchsize,shuffle=True)
eval_dl = torch.utils.data.DataLoader(eval_ds,batch_size=2)


from tqdm import tqdm
import time

def Evaluate(model,eval_dl,epoch):
    model.eval()
    with torch.no_grad():
        tqdm_bar = tqdm(eval_dl)
        tqdm_bar.set_description(desc=f'Evaluation in Epoch:{epoch}',refresh=True)
        total_loss = 0
        for j,item in enumerate(tqdm_bar):
            img,label = item   
            img = img.to('cuda',torch.float32)
            label = label.to('cuda',torch.long)

            pred_logits = model(img)        
            loss = loss_fn(pred_logits['out'],label)


            batch_loss = float(loss.cpu())
            total_loss = total_loss + batch_loss
            avg_loss = total_loss/(j+1)

            tqdm_bar.set_postfix({'Eval_Avg_loss': avg_loss})
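Evaluate only tracks the cross-entropy loss; for segmentation the mean IoU is usually more informative. A sketch using torchmetrics' JaccardIndex (this metric is an addition, not part of the original notes):

from torchmetrics import JaccardIndex

iou_metric = JaccardIndex(task='multiclass',num_classes=2).to('cuda')

def Evaluate_IoU(model,eval_dl):
    # mean IoU over the eval split, in addition to the loss tracked above
    model.eval()
    iou_metric.reset()
    with torch.no_grad():
        for img,label in eval_dl:
            img = img.to('cuda',torch.float32)
            label = label.to('cuda',torch.long)
            pred = model(img)['out'].argmax(dim=1)   # (N,H,W) predicted class per pixel
            iou_metric.update(pred,label)
    return float(iou_metric.compute())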





def Train(model,train_dl,eval_dl,Epoch):


    for i in range(Epoch):
        model.train()
        tqdm_bar = tqdm(train_dl)
        tqdm_bar.set_description(desc=f'Train Epoch:{i}',refresh=True)
        total_loss = 0
        for j,item in enumerate(tqdm_bar):
            img,label = item   
            img = img.to('cuda',torch.float32)
            label = label.to('cuda',torch.long)

            pred_logits = model(img)        
            loss = loss_fn(pred_logits['out'],label)

            loss.backward()
            opt.step()
            opt.zero_grad()

            batch_loss = float(loss.cpu())
            total_loss = total_loss + batch_loss
            avg_loss = total_loss/(j+1)

            tqdm_bar.set_postfix({'Batch_Loss':batch_loss,'Epoch_avg_loss': avg_loss})

        lr_scheduler.step()
        Evaluate(model,eval_dl,i)

Train(model,train_dl,eval_dl,Epoch)

model.eval()
img,label = eval_ds[5]
rst = model(img.unsqueeze(0).float().cuda())
infer_tensor = rst['out'].argmax(1)
topil(infer_tensor.float())


# save_state_dict
torch.save(model.state_dict(),'res_seg_sd.ptd')
new_model_1 = v.models.segmentation.deeplabv3_resnet50(pretrained=False,pretrained_backbone=False,num_classes=2).cuda()
state_dict= torch.load('res_seg_sd.ptd')
new_model_1.load_state_dict(state_dict)
new_model_1.cuda()
new_model_1.eval()
img,label = eval_ds[5]
rst = new_model_1(img.unsqueeze(0).float().cuda())
infer_tensor = rst['out'].argmax(1)
topil(infer_tensor.float())

img,label = eval_ds[6]
topil(img)

# save_model
torch.save(model,'res_seg_model.ptm')

new_model = torch.load('res_seg_model.ptm')

new_model.cuda()
new_model.eval()
img,label = eval_ds[6]
rst = new_model(img.unsqueeze(0).float().cuda())
infer_tensor = rst['out'].argmax(1)
topil(infer_tensor.float())

infer_tensor.squeeze()

from torchvision import transforms as T
import torch


a = [[
    [1,1,4,4],
    [7,0,3,3]
]]
a = torch.IntTensor(a)
a.shape

T.InterpolationMode.BILINEAR

r1 = T.Resize((10,10),interpolation=T.InterpolationMode.NEAREST)
r2 = T.Resize((10,10),interpolation=T.InterpolationMode.BILINEAR)

r1(a)
r2(a)


# multi-label classification
a = torch.Tensor([[10,-10,10,-10,10]])
b = torch.Tensor([[1,0,1,0,1]])

loss = torch.nn.BCEWithLogitsLoss()

loss(a,b)

sig  = torch.nn.Sigmoid()
rst = (sig(a)>0.5).int()
dim1,dim2 = torch.where(rst==1)
i = 1
rst[dim1[i],dim2[i]]

xgboost

import xgboost as xgb
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target
print(y)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 20  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)
bst.dump_model('dump.raw.txt')
preds = bst.predict(dtest)


import numpy as np
best_preds = np.asarray([np.argmax(line) for line in preds])
from sklearn.metrics import precision_score
precision_score(y_test, best_preds, average='macro')

import xgboost as xgb
n_estimators = 50
params = {'n_estimators':n_estimators, 'booster':'gbtree', 'max_depth':5, 'learning_rate':0.05,
          'objective':'reg:squarederror', 'subsample':1, 'colsample_bytree':1}
clf = xgb.XGBRegressor(**params)
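A minimal usage sketch for the regressor; the diabetes dataset and the split below are assumptions, only to show the fit/score round trip:

from sklearn.datasets import load_diabetes

Xr, yr = load_diabetes(return_X_y=True)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)
clf.fit(Xr_train, yr_train)
print(clf.score(Xr_test, yr_test))   # R^2 on the held-out split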

Data mining

# kmeans
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics


def get_best_cluster_num(X,cluster_nums):

    metric_score = []
    models = []
    for t in cluster_nums:


        kmeans_model = KMeans(n_clusters=t).fit(X)

        score = metrics.silhouette_score(X, kmeans_model.labels_,metric='euclidean')
        models.append(kmeans_model)
        metric_score.append(score)
    best_idx= np.array(metric_score).argmax()
    print(best_idx)
    best_cls = cluster_nums[best_idx]
    return models[best_idx],best_cls
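A quick usage sketch on synthetic data (make_blobs and the parameter values are assumptions, just to exercise the helper):

from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
best_model, best_k = get_best_cluster_num(X_demo, list(range(2, 8)))
print(best_k, best_model.inertia_)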


# anomaly detection
import pandas as pd
tmpdf = pd.DataFrame({'p':[1]*4+[0]*4+[66]+[0]*5,'q':[3]*4+[0]*4+[66]+[0]*5})
from sklearn.cluster import DBSCAN
DBSCAN(eps=1, min_samples=3).fit_predict(tmpdf)   # -1 marks noise/outlier points
from sklearn.ensemble import IsolationForest
IsolationForest().fit_predict(tmpdf)              # -1 marks anomalies, 1 marks inliers
from sklearn.svm import OneClassSVM
OneClassSVM(nu=0.3).fit_predict(tmpdf)            # -1 marks anomalies
from sklearn.cluster import KMeans
KMeans(n_clusters=2).fit_predict(tmpdf)           # the extreme row tends to land in its own small cluster

from sklearn.cluster import KMeans  
wcss = []
# elbow method: sweep the number of clusters and record the within-cluster sum of squares (WCSS)
for i in range(1,11):
    ## initialize KMeans with i clusters ##
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(X3)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('elbow.png')
plt.show()
#print(words[250:300])
kmeans = KMeans(n_clusters = 3, n_init = 20) # n_init: number of k-means runs with different centroid seeds
kmeans.fit(X3)
common_words = kmeans.cluster_centers_.argsort()[:,-1:-26:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
weather_df_clus_temp = weather_df[["Tm", "Tx", "Tn", "xm", "ym"]]
weather_df_clus_temp = StandardScaler().fit_transform(weather_df_clus_temp)

db = DBSCAN(eps=0.3, min_samples=10).fit(weather_df_clus_temp)
labels = db.labels_
print (labels[500:560])
weather_df["Clus_Db"]=labels

realClusterNum=len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

from sklearn.decomposition import PCA
from sklearn.datasets import load_digits
digits = load_digits()###the digits dataset visualized below
pca=PCA(n_components=2)###build a PCA model with two components
pca.fit(digits.data)###fit PCA so the digits data can be projected onto the first two principal components
digits_pca=pca.transform(digits.data)
colors=["#476A2A","#7851B8","#BD3430","#4A2D4E","#875525","#A83683","#4E656E","#853541","#3A3120","#535D8E"]
plt.figure(figsize=(10,10))
plt.xlim(digits_pca[:,0].min(),digits_pca[:,0].max())
plt.ylim(digits_pca[:,1].min(),digits_pca[:,1].max())
for i in range(len(digits.data)):###draw each sample as a text marker at its PCA coordinates
    plt.text(digits_pca[i,0],digits_pca[i,1],str(digits.target[i]),color=colors[digits.target[i]],fontdict={"weight":"bold","size":9})
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
from sklearn.manifold import TSNE
tsne = TSNE(random_state=42)###use fit_transform rather than fit, because TSNE has no transform method
digits_tsne = tsne.fit_transform(digits.data)###this can take a while to run
plt.figure(figsize=(10,10))
plt.xlim(digits_tsne[:,0].min(),digits_tsne[:,0].max()+1)
plt.ylim(digits_tsne[:,1].min(),digits_tsne[:,1].max()+1)
for i in range(len(digits.data)):###draw each sample as a text marker at its t-SNE coordinates
    plt.text(digits_tsne[i,0],digits_tsne[i,1],str(digits.target[i]),color=colors[digits.target[i]],fontdict={"weight":"bold","size":9})
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")