Via a callback
import tensorflow as tf
import datetime
mnist=tf.keras.datasets.mnist
(x_train,y_train),(x_test,y_test)=mnist.load_data()
x_train,x_test=x_train/255.0,x_test/255.0
print(x_train.shape,type(x_train))  # 60,000 training examples
def create_model():
    return tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28,28)),
        tf.keras.layers.Dense(512,activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10,activation='softmax')
    ])
model=create_model()
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
log_dir='logs/fit/'+datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
tensorboard_callback=tf.keras.callbacks.TensorBoard(log_dir=log_dir,histogram_freq=1)  # the default batch size is 32, so total iterations = 60000*5/32 = 9375
model.fit(x=x_train,y=y_train,epochs=5,validation_data=(x_test,y_test),callbacks=[tensorboard_callback])
After training finishes, run tensorboard --logdir logs/fit to watch how accuracy and loss change.
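If you are working in a Jupyter or Colab notebook, TensorBoard can also be displayed inline instead of being launched from the shell; a minimal sketch, assuming the notebook extension is available:
%load_ext tensorboard
%tensorboard --logdir logs/fit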
Via tf.summary
import tensorflow as tf
print('TensorFlow version:',tf.__version__)
from tensorflow.keras.layers import Dense,Flatten,Conv2D
from tensorflow.keras import Model
import datetime
mnist=tf.keras.datasets.mnist
(x_train,y_train),(x_test,y_test)=mnist.load_data()
x_train,x_test=x_train/255.0,x_test/255.0
# Add a channels dimension
x_train=x_train[...,tf.newaxis].astype('float32')
x_test=x_test[...,tf.newaxis].astype('float32')
train_dataset=tf.data.Dataset.from_tensor_slices((x_train,y_train)).shuffle(x_train.shape[0]).batch(64)
test_dataset=tf.data.Dataset.from_tensor_slices((x_test,y_test)).batch(64)
class MyModel(Model):
    def __init__(self):
        super(MyModel,self).__init__()
        self.conv1=Conv2D(32,3,activation='relu')
        self.flatten=Flatten()
        self.d1=Dense(128,activation='relu')
        self.d2=Dense(10)
    def call(self,x):
        x=self.conv1(x)
        x=self.flatten(x)
        x=self.d1(x)
        return self.d2(x)
# Create an instance of the model
model=MyModel()
loss_object=tf.keras.losses.SparseCategoricalCrossentropy()
optimizer=tf.keras.optimizers.Adam()
train_loss=tf.keras.metrics.Mean(name='train_loss',dtype=tf.float32)
train_accuracy=tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss=tf.keras.metrics.Mean(name='test_loss',dtype=tf.float32)
test_accuracy=tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
@tf.function
def train_step(images,labels):
    with tf.GradientTape() as tape:
        # training=True is only needed if there are layers with different behavior during training versus inference (e.g. Dropout)
        predictions=model(images,training=True)
        loss=loss_object(labels,predictions)
    gradients=tape.gradient(loss,model.trainable_variables)
    optimizer.apply_gradients(zip(gradients,model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels,predictions)
@tf.function
def test_step(images,labels):
    # training=False is only needed if there are layers with different behavior during training versus inference (e.g. Dropout)
    predictions=model(images,training=False)
    t_loss=loss_object(labels,predictions)
    test_loss(t_loss)
    test_accuracy(labels,predictions)
current_time=datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
train_log_dir='logs/gradient_tape/'+current_time+'/train'
test_log_dir='logs/gradient_tape/'+current_time+'/test'
train_summary_writer=tf.summary.create_file_writer(train_log_dir)
test_summary_writer=tf.summary.create_file_writer(test_log_dir)
EPOCHS=5
for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
    for images,labels in train_dataset:
        train_step(images,labels)
    with train_summary_writer.as_default():
        tf.summary.scalar('loss',train_loss.result(),step=epoch)
        tf.summary.scalar('accuracy',train_accuracy.result(),step=epoch)
    for test_images,test_labels in test_dataset:
        test_step(test_images,test_labels)
    with test_summary_writer.as_default():
        tf.summary.scalar('loss',test_loss.result(),step=epoch)
        tf.summary.scalar('accuracy',test_accuracy.result(),step=epoch)
    print(f'Epoch {epoch+1}, Loss: {train_loss.result()}, Accuracy: {train_accuracy.result()*100}, Test Loss: {test_loss.result()}, Test Accuracy: {test_accuracy.result()*100}')
After training finishes, run tensorboard --logdir logs/gradient_tape/ to watch loss and accuracy change across epochs.
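Scalars are not the only summary type you can write from a custom loop; a minimal sketch (reusing the train_summary_writer and model defined above, placed inside the epoch loop) that also records weight histograms for TensorBoard's Histograms tab:
    with train_summary_writer.as_default():
        for variable in model.trainable_variables:
            tf.summary.histogram(variable.name,variable,step=epoch)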
Using tf.summary to record how a custom learning rate changes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
from packaging import version
import tensorflow as tf
from tensorflow import keras
import numpy as np
print('TensorFlow version:', tf.__version__)
assert version.parse(tf.__version__).release[0]>=2,'This notebook requires TensorFlow 2.0 or above'
data_size=10000
train_pct=0.8
train_size=int(data_size*train_pct)
x=np.linspace(-1000,1000,data_size)
np.random.shuffle(x)
y=0.5*x+2+np.random.normal(0,0.5,(data_size,))
print('y\'s shape',y.shape)
x_train,y_train=x[:train_size],y[:train_size]
x_test,y_test=x[train_size:],y[train_size:]
logdir='logs/scalars/'+datetime.now().strftime('%Y%m%d-%H%M%S')
file_writer=tf.summary.create_file_writer(logdir+'/metrics')
file_writer.set_as_default()
def lr_schedule(epoch):
    # Return a custom learning rate that decreases as epochs progress
    learning_rate=0.2
    if epoch>10:
        learning_rate=0.02
    if epoch>20:
        learning_rate=0.01
    if epoch>50:
        learning_rate=0.005
    with file_writer.as_default():
        tf.summary.scalar('learning rate',data=learning_rate,step=epoch)
    return learning_rate
lr_callback=keras.callbacks.LearningRateScheduler(lr_schedule)
tensorboard_callback=keras.callbacks.TensorBoard(log_dir=logdir)
model=keras.models.Sequential([keras.layers.Dense(16,input_dim=1),keras.layers.Dense(1),])
model.compile(loss='mse',optimizer=keras.optimizers.SGD(),)
training_history=model.fit(x_train,y_train,batch_size=1000,epochs=10,validation_data=(x_test,y_test),callbacks=[tensorboard_callback,lr_callback],)
Then run tensorboard --logdir logs/scalars and click the 20221029-1705-metric run to view the learning-rate curve.
Exporting the data from the event logs above
from tensorboard.backend.event_processing import event_accumulator
import tensorflow as tf
# Load the log data
ea=event_accumulator.EventAccumulator(r'logs/scalars/20221029-171934/metrics/events.out.tfevents.1667035175.noteb.5894.0.v2')
ea.Reload()
print(ea.tensors.Keys())
epoch_loss=ea.tensors.Items('learning rate')
print(len(epoch_loss))
print([(i.step,tf.make_ndarray(i.tensor_proto)) for i in epoch_loss])
After ea.Reload(), check which submenu a tag lives under via ea.Tags() [4]; in my experiments all of these tags were under 'tensors', unlike many posts online that access them with ea.scalars.Keys() [3].
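A minimal sketch of that inspection step, pointing the accumulator at the run directory from above (it accepts a directory as well as a single event file):
from tensorboard.backend.event_processing import event_accumulator
ea=event_accumulator.EventAccumulator('logs/scalars/20221029-171934/metrics')
ea.Reload()
print(ea.Tags())  # a dict keyed by category, e.g. 'scalars', 'tensors', 'images'; here 'learning rate' shows up under 'tensors'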
from datetime import datetime
import io
import itertools
from packaging import version
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics
print('TensorFlow version: ',tf.__version__)
assert version.parse(tf.__version__).release[0]>=2,'This notebook requires TensorFlow 2.0 or above'
# Download the data. The data is already divided into train and test. The labels are integers representing classes.
fashion_mnist=keras.datasets.fashion_mnist
(train_images,train_labels),(test_images,test_labels)=fashion_mnist.load_data()
# Names of the integer classes, i.e., 0->T-shirt/top,1->Trouser, etc.
class_names=['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']
print('Shape: ',train_images[0].shape)
print('Label: ',train_labels[0],'->',class_names[train_labels[0]])
# Reshape the image for the Summary API
img=np.reshape(train_images[0],(-1,28,28,1))
# Clear out any prior log data, then set up a timestamped log directory
logdir='logs/train_data/'+datetime.now().strftime('%Y%m%d-%H%M%S')
# Creates a file writer for the log directory
file_writer=tf.summary.create_file_writer(logdir)
# Using the file writer, log the reshaped image.
with file_writer.as_default():
    images=np.reshape(train_images[:25],(-1,28,28,1))
    tf.summary.image('25 training data examples',images,max_outputs=25,step=0)
logdir='logs/plots/'+datetime.now().strftime('%Y%m%d-%H%M%S')
file_writer=tf.summary.create_file_writer(logdir)
def plot_to_image(figure):
    # Converts the matplotlib plot specified by 'figure' to a PNG image and returns it. The supplied figure is closed and inaccessible after this call.
    # Save the plot to a PNG in memory.
    buf=io.BytesIO()
    plt.savefig(buf,format='png')
    # Closing the figure prevents it from being displayed directly inside the notebook
    plt.close(figure)
    buf.seek(0)
    # Convert PNG buffer to TF image
    image=tf.image.decode_png(buf.getvalue(),channels=4)
    # Add the batch dimension
    image=tf.expand_dims(image,0)
    return image
def image_grid():
    # Return a 5x5 grid of the Fashion-MNIST images as a matplotlib figure
    # Create a figure to contain the plot
    figure=plt.figure(figsize=(10,10))
    for i in range(25):
        # Start next subplot
        plt.subplot(5,5,i+1,title=class_names[train_labels[i]])
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(train_images[i],cmap=plt.cm.binary)
    return figure
# Prepare the plot
figure=image_grid()
# Convert to image and log
with file_writer.as_default():
    tf.summary.image('Training data',plot_to_image(figure),step=0)
model=keras.models.Sequential([keras.layers.Flatten(input_shape=(28,28)),keras.layers.Dense(32,activation='relu'),keras.layers.Dense(10,activation='softmax')])
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
def plot_confusion_matrix(cm,class_names):
    '''
    Returns a matplotlib figure containing the plotted confusion matrix.
    Args:
        cm(array,shape=[n,n]): a confusion matrix of integer classes
        class_names(array,shape=[n]): String names of the integer classes
    '''
    figure=plt.figure(figsize=(8,8))
    plt.imshow(cm,interpolation='nearest',cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks=np.arange(len(class_names))
    plt.xticks(tick_marks,class_names,rotation=45)
    plt.yticks(tick_marks,class_names)
    # Compute the labels from the normalized confusion matrix
    labels=np.around(cm.astype('float')/cm.sum(axis=1)[:,np.newaxis],decimals=2)
    # Use white text if squares are dark; otherwise black
    threshold=cm.max()/2
    for i,j in itertools.product(range(cm.shape[0]),range(cm.shape[1])):
        color="white" if cm[i,j]>threshold else "black"
        plt.text(j,i,labels[i,j],horizontalalignment="center",color=color)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure
logdir='logs/image/'+datetime.now().strftime('%Y%m%d-%H%M%S')
# Define the basic TensorBoard callback
tensorboard_callback=keras.callbacks.TensorBoard(log_dir=logdir)
file_writer_cm=tf.summary.create_file_writer(logdir+'/cm')
def log_confusion_matrix(epoch,logs):
    # Use the model to predict the values from the validation dataset
    test_pred_raw=model.predict(test_images)
    test_pred=np.argmax(test_pred_raw,axis=1)
    # Calculate the confusion matrix
    cm=sklearn.metrics.confusion_matrix(test_labels,test_pred)
    figure=plot_confusion_matrix(cm,class_names=class_names)
    cm_image=plot_to_image(figure)
    # Log the confusion matrix as an image summary
    with file_writer_cm.as_default():
        tf.summary.image('Confusion Matrix',cm_image,step=epoch)
# Define the per-epoch callback
cm_callback=keras.callbacks.LambdaCallback(on_epoch_end=log_confusion_matrix)
model.fit(train_images,train_labels,epochs=5,callbacks=[tensorboard_callback,cm_callback],validation_data=(test_images,test_labels),)
TensorBoard now renders the logged figures, with the confusion-matrix callback running at the end of every epoch.
from datetime import datetime
from packaging import version
import tensorflow as tf
from tensorflow import keras
print('TensorFlow version: ',tf.__version__)
assert version.parse(tf.__version__).release[0]>=2,'This notebook requires TensorFlow 2.0 or above'
import tensorboard
# The function to be traced
@tf.function
def my_func(x,y):
# A simple hand-rolled layer.
return tf.nn.relu(tf.matmul(x,y))
# set up logging
stamp=datetime.now().strftime('%Y%m%d-%H%M%S')
logdir='logs/func/%s'%stamp
writer=tf.summary.create_file_writer(logdir)
# Sample data for your function
x=tf.random.uniform((3,3))
y=tf.random.uniform((3,3))
# Bracket the function call with tf.summary.trace_on() and tf.summary.trace_export().
tf.summary.trace_on(graph=True,profiler=True)
# Call only one tf.function when tracing
z=my_func(x,y)
with writer.as_default():
    tf.summary.trace_export(name='my_func_trace',step=0,profiler_outdir=logdir)
This records the TensorFlow computation graph produced by tracing the function and lets you visualize it: inspect its structure, the conceptual graph, and the profile, with graph nodes colored by TPU compatibility, memory, or compute. One open issue: coloring by compute time is only enabled if the RunMetadata proto is passed to the FileWriter when a specific session is run, so after running the code above the memory and compute coloring options stay disabled in the TensorBoard UI.
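As a point of reference, a minimal TF1-style sketch (via tf.compat.v1, with a hypothetical logdir) of passing the RunMetadata proto to the FileWriter so that compute-time and memory coloring become available; this is an assumption based on the graph-mode API, not part of the tutorial code above:
import numpy as np
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
g=tf.Graph()
with g.as_default():
    x=tf.compat.v1.placeholder(tf.float32,(3,3))
    y=tf.compat.v1.placeholder(tf.float32,(3,3))
    z=tf.nn.relu(tf.matmul(x,y))
with tf.compat.v1.Session(graph=g) as sess:
    writer=tf.compat.v1.summary.FileWriter('logs/func_v1',sess.graph)  # hypothetical logdir
    run_options=tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
    run_metadata=tf.compat.v1.RunMetadata()
    sess.run(z,feed_dict={x:np.random.rand(3,3),y:np.random.rand(3,3)},options=run_options,run_metadata=run_metadata)
    # Attaching the RunMetadata to the writer is what enables coloring nodes by compute time and memory
    writer.add_run_metadata(run_metadata,tag='step0')
    writer.close()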
import tensorflow as tf
from datetime import datetime
import json
from packaging import version
import tempfile
print('TensorFlow version: ',tf.__version__)
assert version.parse(tf.__version__).release[0]>=2,'This notebook requires TensorFlow 2.0 or above.'
my_text="hello world ! //smile"
# Sets up a timestamped log directory
logdir='logs/text_basics/'+datetime.now().strftime('%Y%m%d-%H%M%S')
# Creates a file writer for the log directory
file_writer=tf.summary.create_file_writer(logdir)
# Using the file writer, log the text
with file_writer.as_default():
    tf.summary.text('first_text',my_text,step=0)
# Sets up a second directory to not overwrite the first one
logdir='logs/multiple_texts/'+datetime.now().strftime('%Y%m%d-%H%M%S')
# Creates a file writer for the log directory
file_writer=tf.summary.create_file_writer(logdir)
# Using the file writer, log the text.
with file_writer.as_default():
    with tf.name_scope('name_scope_1'):
        for step in range(20):
            tf.summary.text('a_stream_of_text',f'hello from step {step}',step=step)
            tf.summary.text('another_stream_of_text',f'This can be kept separate {step}',step=step)
    with tf.name_scope('name_scope_2'):
        tf.summary.text('just_from_step_0','This is an important announcement from step 0',step=0)
# Sets up a third timestamped log directory under 'logs'
logdir='logs/markdown/'+datetime.now().strftime('%Y%m%d-%H%M%S')
# Create a file writer for the log directory
file_writer=tf.summary.create_file_writer(logdir)
some_obj_worth_noting={
    'tfds_training_data':{
        'name':'mnist',
        'split':'train',
        'shuffle_files':'true',
    },
    'keras_optimizer':{
        'name':'Adagrad',
        'learning_rate':'0.001',
        'epsilon':1e-07,
    },
    'hardware':'Cloud TPU',
}
# TODO: Update this example when TensorBoard is released with https://github.com/tensorflow/tensorboard/pull/4585 which supports fenced codeblocks in Markdown.
def pretty_json(hp):
    json_hp=json.dumps(hp,indent=2)
    return ''.join('\t'+line for line in json_hp.splitlines(True))
markdown_text='''
### Markdown Text

TensorBoard supports basic markdown syntax, including:

    preformatted code

**bold text**

|and|tables|
|---|---|
|among|others|
'''
with file_writer.as_default():
    tf.summary.text('run_params',pretty_json(some_obj_worth_noting),step=0)
    tf.summary.text('markdown_jubiliee',markdown_text,step=0)
This logs the text (including Markdown) to TensorBoard's Text dashboard.
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
fashion_mnist=tf.keras.datasets.fashion_mnist
(x_train,y_train),(x_test,y_test)=fashion_mnist.load_data()
x_train,x_test=x_train/255.0,x_test/255.0
HP_NUM_UNITS=hp.HParam('num_units',hp.Discrete([16,32]))
HP_DROPOUT=hp.HParam('dropout',hp.RealInterval(0.1,0.2))
HP_OPTIMIZER=hp.HParam('optimizer',hp.Discrete(['adam','sgd']))
METRIC_ACCURACY='accuracy'
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(hparams=[HP_NUM_UNITS,HP_DROPOUT,HP_OPTIMIZER],metrics=[hp.Metric(METRIC_ACCURACY,display_name='Accuracy')],)
def train_test_model(hparams):
    model=tf.keras.models.Sequential([
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(hparams[HP_NUM_UNITS],activation=tf.nn.relu),
        tf.keras.layers.Dropout(hparams[HP_DROPOUT]),
        tf.keras.layers.Dense(10,activation=tf.nn.softmax),
    ])
    model.compile(optimizer=hparams[HP_OPTIMIZER],loss='sparse_categorical_crossentropy',metrics=['accuracy'],)
    model.fit(x_train,y_train,epochs=1)  # Run with 1 epoch to speed things up for demo purposes
    _,accuracy=model.evaluate(x_test,y_test)
    return accuracy
def run(run_dir,hparams):
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)  # record the values used in this trial
        accuracy=train_test_model(hparams)
        tf.summary.scalar(METRIC_ACCURACY,accuracy,step=1)
session_num=0
for num_units in HP_NUM_UNITS.domain.values:
    for dropout_rate in (HP_DROPOUT.domain.min_value,HP_DROPOUT.domain.max_value):
        for optimizer in HP_OPTIMIZER.domain.values:
            hparams={HP_NUM_UNITS:num_units,HP_DROPOUT:dropout_rate,HP_OPTIMIZER:optimizer,}
            run_name='run-%d'%session_num
            print('--- Starting trial: %s'%run_name)
            print({h.name:hparams[h] for h in hparams})
            run('logs/hparam_tuning/'+run_name,hparams)
            session_num+=1
This grid search studies how the hyperparameters affect model accuracy; open the HParams dashboard with tensorboard --logdir logs/hparam_tuning to compare the runs.
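For a larger search space an exhaustive grid quickly gets expensive; a minimal sketch of randomly sampling the same domains instead (the trial budget of 8 is an arbitrary choice here):
import random
for session_num in range(8):
    hparams={
        HP_NUM_UNITS:random.choice(HP_NUM_UNITS.domain.values),
        HP_DROPOUT:random.uniform(HP_DROPOUT.domain.min_value,HP_DROPOUT.domain.max_value),
        HP_OPTIMIZER:random.choice(HP_OPTIMIZER.domain.values),
    }
    run('logs/hparam_tuning/run-random-%d'%session_num,hparams)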
References:
[1]. Get started with TensorBoard
[2]. TensorBoard Scalars: Logging training metrics in Keras
[3]. 读取并导出Tensorboard中数据 (Reading and exporting data from TensorBoard)
[4]. mlflow/tensorflow/tb.py
[5]. Tensorflow - Tensorboard Event Accumulator get Tensor from TensorEvent
[6]. Displaying image data in TensorBoard
[7]. Examining the TensorFlow graph
[8]. Displaying text data in TensorBoard
[9]. Hyperparameter tuning with the HParams dashboard