/*
 * 5799-WZQ (C) COPYRIGHT = NONE
 * LICENSED MATERIALS - PROPERTY OF IBM
 */
/* $Header:rdwr.c 12.0$ */
/* $ACIS:rdwr.c 12.0$ */
/* $Source: /ibm/acis/usr/src/ibm/rvd/server/RCS/rdwr.c,v $ */

/*
 * RCS identification strings, embedded in the object file so the revision
 * can be recovered from a binary.  They are omitted when running lint, which
 * would otherwise complain about unused statics.  Both historical spellings
 * are kept.
 */
#ifndef lint
static char *rcsid = "$Header:rdwr.c 12.0$";
static char rcsid_rdwr_c[] = "$Header:rdwr.c 12.0$";
#endif /* lint */

/* Copyright 1984 by the Massachusetts Institute of Technology */
/* See permission and disclaimer notice in the file "notice.h" */
#include "notice.h"

/* This file contains the routines which actually read and write blocks
 * of data to and from the disk.  The routines in this file are designed
 * to minimize the number of system calls performed, by compacting
 * contiguous requests and performing contiguous reads and writes as
 * single disk transfers.
 */

#include	<sys/types.h>
#include	<stdio.h>
#include	<sys/param.h>
#include	<netinet/in.h>
#include	<netinet/in_systm.h>
#include	<netinet/ip.h>
#include	<netinet/rvd.h>
#include	<machineio/vdconst.h>

#include	"rvd_types.h"
#include	"rvdadd.h"
#include	"custom.h"
#include	"logging.h"
#include	"obj.h"
#include	"queue.h"
#include	"packet.h"
#include	"physd.h"
#include	"virtd.h"
#include	"conn.h"
#include	"extern.h"


/* write_blocks(pkt): number of RVDDSIZE-byte disk blocks carried by a write
 * packet.
 *
 * The packet length (rp_len) has the size of the write header structure
 * subtracted and one RVDDSIZE added back before dividing by RVDDSIZE.
 * NOTE(review): this presumably means struct rvdw already contains room for
 * one data block -- confirm against the definition in <netinet/rvd.h>.
 *
 * The macro assumes the packet holds a whole multiple of RVDDSIZE bytes of
 * data (this is checked in pkt_check()).  Because sizeof yields an unsigned
 * value the arithmetic is unsigned, so a packet shorter than the header
 * would produce a very large result rather than a negative one.
 *
 * The argument is evaluated exactly once.
 */

#define	write_blocks(pkt)	(((pkt)->rp_len - sizeof(struct rvdw) \
+ RVDDSIZE) / RVDDSIZE)

/*
 * Running counters for the read/write path.  A single instance, rw_stats,
 * is defined here; it is updated by do_read, do_write and sorted_insque
 * and printed by rw_show.
 */
struct	rw_stats {			/* read/write statistics */
	int	rs_rck;			/* read requests rejected by read_check */
	int	rs_rerr;		/* read() calls that failed or came up short */
	int	rs_rblock;		/* total number of blocks read */
	int	rs_reads;		/* total number of read calls done */
	int	rs_wck;			/* write requests rejected by write_check */
	int	rs_werr;		/* write() calls that failed or came up short */
	int	rs_wblock;		/* total number of blocks written */
	int	rs_writes;		/* total number of write calls done */
	int	rs_serr;		/* lseek() failures (reads and writes) */
	int	rs_dupl;		/* duplicate requests discarded on enqueue */
} rw_stats;


/* Write as many contiguous blocks as possible from the front of the specified
 * connection's write queue.  This is done by copying the data from each packet
 * buffer into a large write buffer and then performing a single write.  Then,
 * send the first packet in the request as a write ack, possibly containing an
 * error status code.
 */

do_write(cn)

register struct	conn	*cn;		/* connection wanting to write */
{
	register struct	rvd_pkt	*pkt;	/* temp packet pointer */
	register char	*bufp;		/* pointer into big buffer */
	register u_long	status;		/* write status */
	register u_long	count;		/* blocks being written */
	u_long	nextblock;		/* temp to find contiguous blocks */
	u_long	offset;			/* seek offset into file */
	u_long	size;			/* temp for # of blocks in packet */
	int	length;			/* length of write request in bytes */
	u_long	nonce;			/* unique id for this request */
	struct	rvd_pkt	*first;		/* first packet in request */

	for (pkt = cn->cn_writes.pq_forw, length = 0, count = 0, bufp = bigbuf,
	     nextblock = 0, nonce = 0;

	     (pkt != (struct rvd_pkt *)&(cn->cn_writes.pq_forw)) &&
	     (count + write_blocks(pkt) <= MAXCONTIG) &&
	     (count == 0 || (pkt->rp_rvd.pkt_dpnd.write.blockn == nextblock &&
	      pkt->rp_rvd.nonce == nonce));

	     pkt = cn->cn_writes.pq_forw) {

		rem_q_elem(pkt);
		if (--rw_qlength < 0)
			rw_qlength = 0;
		size = write_blocks(pkt);

		bcopy((char *)&pkt->rp_rvd.pkt_dpnd.write.data[0], bufp,
		    (int)(size * RVDDSIZE));	/* copy data to buffer */

		if (count == 0) {
			nextblock = pkt->rp_rvd.pkt_dpnd.write.blockn + size;
			nonce = pkt->rp_rvd.nonce;
			first = pkt;		/* save first pkt for reply */
		} else {
			nextblock += size;
			pkt_free(pkt);		/* just free others */
		}

		count += size;
		length += (size * RVDDSIZE);
		bufp += (size * RVDDSIZE);
	}

	offset = first->rp_rvd.pkt_dpnd.write.blockn;
	status = 0;				/* ie success */

	if (((status = write_check(cn, offset, count)) & RVDSTVAL) != 0) {

		if (loglevel(LOG_CLIENT_ERROR)) {
			syslog(LOG_INFO, 
			  "do_write: drive %D pack %s bad write request (%s)",
			  cn->cn_drive, cn->cn_virtd->vd_pack,
			  inet_ntoa(cn->cn_fhost));
			syslog(LOG_INFO,
			  "block %D, count %D status %#X",
			  offset, count, status);
		}
		rw_stats.rs_wck++;
/*count = 0;			/* show nothing written */

	} else if (lseek(cn->cn_virtd->pd_forw->pd_fd,
	    (off_t)((cn->cn_virtd->vd_offset + offset) * RVDDSIZE), 0) < 0) {

		rw_stats.rs_serr++;
		status = RVDSTADR | RVDSTVAL;
		if (loglevel(LOG_ERROR)) {
			syslog(LOG_ERR,
			  "do_write: drive %D pack %s seek error (%s)",
			  cn->cn_drive, cn->cn_virtd->vd_pack,
			  inet_ntoa(cn->cn_fhost));
			syslog(LOG_ERR,
			  "          block %D, count %D errno %D",
			  offset, count, errno);
		}
		count = 0;

	} else if (write(cn->cn_virtd->pd_forw->pd_fd, bigbuf, length) !=
	    length) {

		rw_stats.rs_werr++;
		status = RVDSTVAL;
		if (loglevel(LOG_ERROR)) {
			syslog(LOG_ERR,
			  "do_write: drive %D pack %s write error (%s)",
			  cn->cn_drive, cn->cn_virtd->vd_pack,
			  inet_ntoa(cn->cn_fhost));
			syslog(LOG_ERR,
			  "          block %D, count %D errno %D",
			  offset, count, errno);
		}
		count = 0;
	}

	/*
	 * timestamp request being serviced
	 */
	cn->cn_virtd->vd_accessed = cn->cn_virtd->vd_modified = now;

	rw_stats.rs_writes++;
	rw_stats.rs_wblock += count;
	if (loglevel(LOG_RDWR)) {
		syslog(LOG_INFO, 
		  "do_write: drive %D pack %s write request done (%s)",
		  cn->cn_drive, cn->cn_virtd->vd_pack, inet_ntoa(cn->cn_fhost));
		syslog(LOG_INFO,
		  "          block %D, count %D status %#X",
		  offset, count, status);
	}

	rvd_wack(first, status, count);
}


/* Perform the first read request on the specified connection's read queue.
 * There is guaranteed to be at least one request there.  Perform a single
 * read for all the specified blocks into a large buffer, then construct
 * block packets in the buffer and send them to the network.
 *
 */

do_read(cn)

register struct	conn	*cn;			/* connection to read on */
{
	register struct	rvd_pkt	*pkt;		/* read request packet */
	register char	*bufp;			/* buffer address pointer */
	register u_long	count;			/* total blocks to read */
	register u_long	blocks;			/* temp counter */
	register struct	rvd_pkt	*first;		/* first read request packet */
	u_long	nextblock;			/* temp to find contig. blks */
	u_long	nonce;				/* uid for this request */
	int	size;				/* temp for packet size */
	u_long	status;				/* read status */
	u_long	offset;				/* offset into virt. disk */
	u_long  error;				/* set to RVDSTVAL on error */
	int	length;				/* total bytes read */

	for (pkt = cn->cn_reads.pq_forw, count = 0, nextblock = 0, nonce = 0;

	     (pkt != (struct rvd_pkt *)&(cn->cn_reads.pq_forw)) &&
	     (count + pkt->rp_rvd.pkt_dpnd.read.blockc <= MAXCONTIG) &&
	     (count == 0 || (pkt->rp_rvd.pkt_dpnd.read.blockn == nextblock &&
	     pkt->rp_rvd.nonce == nonce));

	    pkt = cn->cn_reads.pq_forw) {

		rem_q_elem(pkt);
		if (--rw_qlength < 0)
			rw_qlength = 0;
		size = pkt->rp_rvd.pkt_dpnd.read.blockc;
		nextblock = pkt->rp_rvd.pkt_dpnd.read.blockn + size;

		if (count == 0) {
			first = pkt;		/* save first pkt for reply */
			nonce = pkt->rp_rvd.nonce;
		} else
			pkt_free(pkt);

		count += size;
	}

	offset = first->rp_rvd.pkt_dpnd.read.blockn;
	length = count * RVDDSIZE;

	if (((status = read_check(cn, offset, count)) & RVDSTVAL) != 0) {
		rw_stats.rs_rck++;
		if (loglevel(LOG_CLIENT_ERROR)) {
			syslog(LOG_INFO, 
			  "do_read: drive %D pack %s bad read request (%s)",
			  cn->cn_drive, cn->cn_virtd->vd_pack,
			  inet_ntoa(cn->cn_fhost));
			if (status & RVDSTDIS) {
				syslog(LOG_INFO, 
				  "         attempt to read disabled device.");
			} else {
				syslog(LOG_INFO,
				  "         block %D, count %D status %#X",
				  offset, count, status);
			}
		}
		rvd_block(first, status);
		return;
	}

	if (lseek(cn->cn_virtd->pd_forw->pd_fd,
	    (off_t)((cn->cn_virtd->vd_offset + offset) * RVDDSIZE), 0) < 0) {
		rw_stats.rs_serr++;
		if (loglevel(LOG_ERROR)) {
			syslog(LOG_ERR,
			  "do_read: drive %D pack %s seek error (%s)",
			  cn->cn_drive, cn->cn_virtd->vd_pack,
			  inet_ntoa(cn->cn_fhost));
			syslog(LOG_ERR,
			  "         block %D, count %D errno %D",
			  offset, count, errno);
		}
		rvd_block(first, (u_long)(RVDSTADR | RVDSTVAL));
		return;
	}

	error = 0;		/* No error yet */
	if (read(cn->cn_virtd->pd_forw->pd_fd, bigbuf + BLOCKOFF, length) !=
	    length) {				/* read it all in at once */
		rw_stats.rs_rerr++;
		if (loglevel(LOG_ERROR)) {
			syslog(LOG_ERR,
			  "do_read: drive %D pack %s read error (%s)",
			  cn->cn_drive, cn->cn_virtd->vd_pack,
			  inet_ntoa(cn->cn_fhost));
			syslog(LOG_ERR,
			  "         block %D count %D errno %D",
			  offset, count, errno);
		}
		error = RVDSTVAL;
		bzero (bigbuf + BLOCKOFF, length);
/*		rvd_block(first, (u_long)RVDSTVAL); */
/*		return;                             */
	}

	for (bufp = bigbuf + BLOCKOFF, blocks = 0; blocks < count; ) {

		size = MIN(count - blocks, cn->cn_rbfactor);
		pkt = (struct rvd_pkt *)(bufp - BLOCKOFF);
		block_complete(pkt, first, blocks, size * RVDDSIZE);
		rvd_block(pkt, error);
		blocks += size;
		bufp += (size * RVDDSIZE);
	}

	/*
	 * timestamp requested being serviced
	 */
	cn->cn_virtd->vd_accessed = now;

	rw_stats.rs_reads++;
	rw_stats.rs_rblock += count;
	if (loglevel(LOG_RDWR)) {
		syslog(LOG_INFO, 
		  "do_read: drive %D pack %s read request done (%s)",
		  cn->cn_drive, cn->cn_virtd->vd_pack,
		  inet_ntoa(cn->cn_fhost));
		syslog(LOG_INFO,
		  "         block %D count %D status %#X",
		  offset, count, status);
	}
	pkt_free(first);
}
	

/* Validate a write request on the specified connection for count blocks
 * starting at block number offset:
 *	virtual disk must be open in shared or exclusive mode
 *	physical disk must be enabled.  (See disuse_physical and use_physical.)
 *	the whole range [offset, offset + count) must lie within the disk
 *
 * The range test is written so that it cannot be defeated by unsigned
 * wrap-around of offset + count.
 *
 * cn		connection to write on
 * offset	starting block number
 * count	number of blocks to write
 *
 * Returns 0 if the request is acceptable; otherwise a status code or'ed
 * with RVDSTVAL (RVDSTWRL: not open for writing, RVDSTDIS: physical disk
 * disabled, RVDSTADR: range outside the disk).  When built with TEST_SERVER,
 * a non-zero value from test_get_write_err is returned instead, before any
 * other check.
 */

u_long
write_check(cn, offset, count)

register struct	conn	*cn;		/* connection to write on */
register u_long	offset;			/* starting block number */
register u_long	count;			/* number of blocks to write */
{
	register u_long	limit;		/* size of virtual disk, in blocks */
#ifdef	TEST_SERVER
	int	error;

	if ((error = test_get_write_err(cn->cn_fhost)) != 0) {
		return(error);
	}
#endif	/* TEST_SERVER */

	if (cn->cn_virtd->vd_mode != RVDMEXC &&
	    cn->cn_virtd->vd_mode != RVDMSHR)
		return(RVDSTWRL | RVDSTVAL);

	if ((cn->cn_virtd->pd_forw->pd_mode & RVD_USE_PHYS) == 0)
		return(RVDSTDIS | RVDSTVAL);

	/*
	 * Equivalent to "offset + count > limit" but without the addition,
	 * which could wrap for a large offset and pass the test.
	 */
	limit = cn->cn_virtd->vd_blocks;
	if (count > limit || offset > limit - count)
		return(RVDSTADR | RVDSTVAL);

	return(0);
}


/* Validate a read request on the specified connection for count blocks
 * starting at block number offset:
 *	physical disk must be enabled.  (See disuse_physical and use_physical.)
 *	count must be less than or equal to MAXCONTIG, the maximum burst size
 *	the whole range [offset, offset + count) must lie within the disk
 *
 * The range test is written so that it cannot be defeated by unsigned
 * wrap-around of offset + count.
 *
 * cn		connection to read on
 * offset	starting block number of read
 * count	number of blocks to read
 *
 * Returns 0 if the request is acceptable; otherwise a status code or'ed
 * with RVDSTVAL (RVDSTDIS: physical disk disabled, RVDSTADR: too many
 * blocks or range outside the disk).  When built with TEST_SERVER, a
 * non-zero value from test_get_read_err is returned instead, before any
 * other check.
 */

u_long
read_check(cn, offset, count)

register struct	conn	*cn;		/* connection to read on */
register u_long	offset;			/* starting block number of read */
register u_long	count;			/* number of blocks to read */
{
	register u_long	limit;		/* size of virtual disk, in blocks */
#ifdef	TEST_SERVER
	int	error;

	if ((error = test_get_read_err(cn->cn_fhost)) != 0) {
		return(error);
	}
#endif	/* TEST_SERVER */

	if ((cn->cn_virtd->pd_forw->pd_mode & RVD_USE_PHYS) == 0)
		return(RVDSTDIS | RVDSTVAL);

	/*
	 * "offset > limit - count" is "offset + count > limit" without the
	 * addition, which could wrap for a large offset and pass the test.
	 */
	limit = cn->cn_virtd->vd_blocks;
	if (count > MAXCONTIG || count > limit || offset > limit - count)
		return(RVDSTADR | RVDSTVAL);
	return(0);
}


/* Enqueue a request packet on a queue kept in ascending block-number order.
 *
 * The queue is searched from the tail towards the head for the last entry
 * whose block number does not exceed the new packet's.  The new packet goes
 * immediately after that entry, or at the head of the queue when there is
 * no such entry.
 *
 * Before inserting, the run of entries with the same block number ending at
 * that position is checked for a matching nonce.  A match means the packet
 * is an exact duplicate of a queued request: it is counted in
 * rw_stats.rs_dupl and freed rather than enqueued.  Entries with the same
 * block number but a different nonce are kept, so the same disk block may
 * be transferred twice; requests of different lengths are not merged.
 *
 * The block number is fetched through the read variant of the packet, so
 * this relies on read and write packets keeping that field at the same
 * location.
 *
 * rw_qlength is incremented for every packet actually enqueued.
 *
 * pkt		packet to enqueue
 * queue	queue to use
 */

sorted_insque(pkt, queue)

register struct	rvd_pkt	*pkt;		/* packet to enqueue */
register struct	pkt_q	*queue;		/* queue to use */
{
	register struct	rvd_pkt	*spot;	/* entry to insert after */
	register struct	rvd_pkt	*scan;	/* cursor for duplicate search */
	register u_long	want;		/* block number of new packet */
	struct	rvd_pkt	*head;		/* queue header, viewed as a packet */

	head = (struct rvd_pkt *)queue;
	want = pkt->rp_rvd.pkt_dpnd.read.blockn;

	/* skip back over every entry that sorts after the new packet */
	spot = queue->pq_back;
	while (spot != head && spot->rp_rvd.pkt_dpnd.read.blockn > want)
		spot = spot->rp_back;

	if (spot == head) {
		/* nothing sorts at or before it: must be first */
		ins_q_head(pkt, queue);
		rw_qlength++;
		return;
	}

	/* look through any entries for the same block for the same nonce */
	scan = spot;
	while (scan != head && scan->rp_rvd.pkt_dpnd.read.blockn == want) {
		if (scan->rp_rvd.nonce == pkt->rp_rvd.nonce) {
			rw_stats.rs_dupl++;
			pkt_free(pkt);		/* duplicate, toss */
			return;
		}
		scan = scan->rp_back;
	}

	ins_q_after(pkt, spot);			/* not a duplicate */
	rw_qlength++;
}


/* Show read/write statistics in log.
 */

rw_show()

{
	syslog(LOG_INFO, "Disk read/write statistics:");
	syslog(LOG_INFO, " %8d    bad read requests", rw_stats.rs_rck);
	syslog(LOG_INFO, " %8d    read errors", rw_stats.rs_rerr);
	syslog(LOG_INFO, " %8d    blocks read", rw_stats.rs_rblock);
	syslog(LOG_INFO, " %8d    read calls done", rw_stats.rs_reads);
	syslog(LOG_INFO, " %8d    bad write requests", rw_stats.rs_wck);
	syslog(LOG_INFO, " %8d    write errors", rw_stats.rs_werr);
	syslog(LOG_INFO, " %8d    blocks written", rw_stats.rs_wblock);
	syslog(LOG_INFO, " %8d    write calls done", rw_stats.rs_writes);
	syslog(LOG_INFO, " %8d    seek errors", rw_stats.rs_serr);
	syslog(LOG_INFO, " %8d    duplicate requests\n", rw_stats.rs_dupl);
}

/*
 * rw_qlen_show(): Show read/write queue length statistics.
 *
 * The queue lengths are recorded as a histogram in rw_qsize.  Each index
 * "n" of the rw_qsize array stands for a queue length "s(n)", and the value
 * of rw_qsize[n] is a count of the number of samples for which:
 *
 *		s(n) <= queue length < s(n+1)
 *
 * The relationship between "s" and the array index is: if the array index is
 * thought of as a two digit decimal number then the first digit is the
 * power of ten of the sample and the second digit is the mantissa.  For
 * example, if n = 23 then rw_qsize[23] is a count of all queue lengths
 * where s(23) = 3*(10**2) = 300 <= queue length < 400 = 4*(10**2) = s(24).
 *
 * The rw_qlength variable (not used here) is used to measure current queue
 * length.  It is incremented in sorted_insque when the packets are queued
 * and decremented in do_read and do_write when the packets are dequeued.
 *
 * The histogram is printed out as a graph, one log line per non-empty
 * bucket.  Each line has a header (the range of queue lengths and the
 * number of samples) followed by floor(log2(rw_qsize[n])) '*'s.  A bucket
 * holding exactly one sample is drawn with a single star so that it is
 * still visible on the graph.  The graph is truncated to fit rw_qstring.
 *
 * No meaningful value is returned.
 */
#define MAX_QSTRING 80		/* size of the output line buffer */
#define MAX_HSTRING 11		/* column width the range header is padded to */
#define HDR_BUFSIZE 32		/* holds any "%d-%d: " rendering of two ints */
#define GRAPH_COLUMN 22		/* column at which the row of stars starts */
char	rw_qstring[MAX_QSTRING];
char	hdr_string[HDR_BUFSIZE];

rw_qlen_show()
{
	int	i, j, k, l, sample_size;
	int	hdr_len;		/* length of the range header text */
	long	samples;		/* working copy of the bucket count */
	char	*cp, *str_end;

	syslog(LOG_INFO, "Disk read/write queue length statistics:");
	syslog(LOG_INFO, "maximum qlen: %d", max_qlength);
	syslog(LOG_INFO, "     qlen  samples    log2(samples)");
	for (i = 0; i < MAX_RW_QARRAY; i++) {

		if (rw_qsize[i] == 0)
			continue;

		/*
		 * Compute the sample size from the index.
		 * At the end of the loop:
		 * 	l = power of 10.
		 *	j = 10**(l)
		 */
		j = 1;
		l = 0;
		for (k = i/10; k != 0; k--) {
			j *= 10;
			l++;
		}
		sample_size = j*(i-(10*l));


		/*
		 * Print the line header.  It consists of:
		 *	           s(n): (number of samples)	(n < 10)
		 *	s(n)-[s(n+1)-1]: (number of samples)	(n >= 10)
		 * Spaces are written by hand because syslog doesn't do tabs.
		 * The worst-case header plus count is well under
		 * MAX_QSTRING, so the sprintf calls cannot overrun.
		 */
		if (sample_size < 10) {
			sprintf(rw_qstring, "%9d: (%d)", 
				sample_size, rw_qsize[i]);
		} else {
			sprintf(hdr_string, "%d-%d: ", 
				sample_size, sample_size + (j - 1));
			hdr_len = (int)strlen(hdr_string);

			/*
			 * Right-justify the header in MAX_HSTRING columns.
			 * A header wider than that simply gets no padding.
			 */
			cp = rw_qstring;
			for (k = hdr_len; k < MAX_HSTRING; k++)
				*cp++ = ' ';
			strcpy(cp, hdr_string); 
			cp += hdr_len;
			sprintf(cp, "(%d)", rw_qsize[i]);
		}

		/* pad out to the column where the graph starts */
		str_end = rw_qstring + GRAPH_COLUMN;
		cp = rw_qstring + strlen(rw_qstring);
		while (cp < str_end)
			*cp++ = ' ';


		/*
		 * Graph is log2(number of samples) stars.  Halving the
		 * count instead of doubling a probe value gives the same
		 * number of stars without overflowing for large counts.
		 */
		samples = rw_qsize[i];
		if (samples == 1) {
			*cp++ = '*';
		} else {
			for (; samples > 1 &&
			     cp < rw_qstring + MAX_QSTRING - 2;
			     samples >>= 1) {
				*cp++ = '*';
			}
		}
		*cp++ = '\n';
		*cp = '\0';
		syslog(LOG_INFO, "%s", rw_qstring);
	}
}
