1 /*
2  * Copyright (C) 2015-2017, by Laszlo Szeremi under the Boost license.
3  *
4  * Pixel Perfect Engine, graphics.layers module
5  */
6 module PixelPerfectEngine.graphics.layers;
7 
8 public import PixelPerfectEngine.graphics.bitmap;
9 public import PixelPerfectEngine.graphics.common;
10 import std.conv;
11 import std.stdio;
12 import std.parallelism;
13 //import system.etc;
14 import PixelPerfectEngine.system.exc;
15 import std.algorithm;
16 import derelict.sdl2.sdl;
17 //import std.range;
18 
19 
20 //Used mainly to return both the color ID and the transparency at the same time to reduce CPU time.
21 /*public struct PixelData {
22  public bool alpha;
23  public ushort color;
24  this(bool a, ushort c){
25  alpha = a;
26  color = c;
27  }
28  }*/
29 
// Constant operands of the MMX/SSE2 blending routines in this module.
// The 128-bit constants are loaded with MOVAPS, which raises a fault when its
// memory operand is not 16-byte aligned, so the alignment is requested explicitly.
static immutable ushort[4] alphaMMXmul_const256 = [256,256,256,256];
static immutable ushort[4] alphaMMXmul_const1 = [1,1,1,1];
/// Eight 16-bit lanes of 256, used to form "256 - alpha".
align(16) static immutable ushort[8] alphaSSEConst256 = [256,256,256,256,256,256,256,256];
/// Eight 16-bit lanes of 1, used to form "1 + alpha".
align(16) static immutable ushort[8] alphaSSEConst1 = [1,1,1,1,1,1,1,1];
/// Four 32-bit lanes with every bit set.
align(16) static immutable uint[4] SSEUQWmaxvalue = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF] ;
35 
36 //static immutable uint[2] alphaMMXmul_0 = [1,1];
37 
/**
 * Per-sprite flip flags stored as two bits.
 * SpriteLayer.updateRaster reads the source row right-to-left when X or XY is set.
 */
public enum FlipRegister : ubyte {
	NORM	=	0,
	X		=	1 << 0,
	Y		=	1 << 1,
	XY		=	X | Y
}
44 
45 /*public interface ILayer{
46 	// Returns color.
47 	//public ushort getPixel(ushort x, ushort y);
48 	// Returns if the said pixel's color is equals with the transparent color index.
49 	//public bool isTransparent(ushort x, ushort y);
50 	// Returns the PixelData.
51 	//public PixelData getPixelData(ushort x, ushort y);
52 	
53 	public void setRasterizer(int rX, int rY);
54 	public void updateRaster(Bitmap16Bit frameBuffer);
55 	public void updateRaster(void* workpad, int pitch, ubyte[] palette);
56 }*/
57 
/**
 * Base class of all layers. Stores the scroll position and the raster size
 * and declares the rendering entry point implemented by the concrete layers.
 */
abstract class Layer {

	/// sX, sY: scroll position. rasterX, rasterY: size passed to setRasterizer.
	private int sX, sY, rasterX, rasterY;

	/// Stores the raster size that the rendering code of the subclasses reads.
	public void setRasterizer(int rX, int rY){
		this.rasterX = rX;
		this.rasterY = rY;
	}

	/// Absolute scrolling: replaces the current scroll position.
	public void scroll(int x, int y){
		this.sX = x;
		this.sY = y;
	}

	/// Relative scrolling: adds x and y to the current scroll position.
	/// Positive values scroll the layer left and up, negative values scroll it down and right.
	public void relScroll(int x, int y){
		sX += x;
		sY += y;
	}

	/// Returns the horizontal scroll position.
	public int getSX(){
		return this.sX;
	}

	/// Returns the vertical scroll position.
	public int getSY(){
		return this.sY;
	}

	/// Override this to enable output to the raster.
	public abstract void updateRaster(void* workpad, int pitch, ubyte[] palette, int[] threads);

}
99 
/// Plain value holding four integers of a tile layer: tileX, tileY, mX and mY
/// (TileLayer.getLayerInfo fills it with its tile size and mapping size).
public struct BLInfo{
	public int tileX, tileY, mX, mY;

	/// Copies the four arguments into the fields of the same order.
	this(int tileX1,int tileY1,int x1,int y1){
		this.tileX = tileX1;
		this.tileY = tileY1;
		this.mX = x1;
		this.mY = y1;
	}
}
/**
 * Selects the rendering routine of a TileLayer.
 */
public enum TileLayerRenderingMode{
	/// Writes the pixels straight into the framebuffer and ignores any transparency.
	/// Intended for cases such as bottom layers.
	COPY,
	/// Bit-block transfer with a mask generated from the alpha value: a zero alpha is
	/// fully transparent, any non-zero alpha is fully opaque. Gradual transparency is not available.
	BLITTER,
	/// Per-pixel alpha blending, which allows gradual transparency.
	ALPHA_BLENDING
}
/**
 * Read-only view of a tile layer, used by the background-sprite tester.
 */
public interface ITileLayer{
	/// Returns the tile and mapping dimensions.
	BLInfo getLayerInfo();
	/// Returns the tile registered under the given ID.
	Bitmap16Bit getTile(wchar id);
	/// Returns the array of tile IDs that forms the map.
	wchar[] getMapping();
}
/**
 * General purpose TileLayer with palette support, mainly for backgrounds.
 * Use multiple instances of this class for parallax scrolling.
 */
133 public class TileLayer : Layer, ITileLayer{
134 	private int tileX, tileY, mX, mY;
135 	private int totalX, totalY;
136 	private wchar[] mapping;
137 	private TileLayerRenderingMode renderMode;
138 	private Bitmap16Bit[wchar] tileSet;
139 	private bool wrapMode; 
140 	///Constructor. tX , tY : Set the size of the tiles on the layer.
141 	this(int tX, int tY, TileLayerRenderingMode renderMode = TileLayerRenderingMode.ALPHA_BLENDING){
142 		tileX=tX;
143 		tileY=tY;
144 		this.renderMode = renderMode;
145 	}
146 	/// Wrapmode: if enabled, the layer will be turned into an "infinite" mode.
147 	public void setWrapMode(bool w){
148 		wrapMode = w;
149 	}
150 	///Gets the the ID of the given element from the mapping. x , y : Position.
151 	public wchar readMapping(int x, int y){
152 		/*if(x<0 || x>totalX/tileX){
153 		 return 0xFFFF;
154 		 }*/
155 		return mapping[x+(mX*y)];
156 	}
157 	///Writes to the map. x , y : Position. w : ID of the tile.
158 	public void writeMapping(int x, int y, wchar w){
159 		mapping[x+(mX*y)]=w;
160 	}
161 	//Loads a mapping from an array. x , y : Sizes of the mapping. map : an array representing the elements of the map.
162 	//x*y=map.length
163 	public void loadMapping(int x, int y, wchar[] map){
164 		mX=x;
165 		mY=y;
166 		mapping = map;
167 		totalX=mX*tileX;
168 		totalY=mY*tileY;
169 	}
170 	//Adds a tile to the tileSet. t : The tile. id : The ID in wchar to differentiate between different tiles.
171 	public void addTile(Bitmap16Bit t, wchar id){
172 		if(t.getX()==tileX && t.getY()==tileY){
173 			tileSet[id]=t;
174 		}
175 		else{
176 			throw new TileFormatException("Incorrect tile size!", __FILE__, __LINE__, null);
177 		}
178 	}
179 	//Removes the tile with the ID from the set.
180 	public void removeTile(wchar id){
181 		tileSet.remove(id);
182 	}
183 
184 	public wchar tileByPixel(int x, int y){
185 		if(x/tileX + (y/tileY)*mX < 0 || x/tileX + (y/tileY)*mX >= mapping.length) return 0xFFFF;
186 		return mapping[x/tileX + (y/tileY)*mX];
187 	}
188 	
189 	public override void updateRaster(void* workpad, int pitch, ubyte[] palette, int[] threads){
190 
191 		if((sX + rasterX <= 0 || sX > totalX) && !wrapMode) return;
192 		switch(renderMode){
193 			case TileLayerRenderingMode.ALPHA_BLENDING:
194 				int y = sY < 0 ? sY * -1 : 0;
195 				//int yBegin = sY < 0 ? sY * -1 : 0;
196 				/*if(wrapMode){
197 				 y = sX + 0x7FFFFFFF;
198 				 }else{
199 				 y = sX < 0 ? 0 : sX;
200 				 }*/
201 				for( ; y < rasterY ; y++){
202 					//writeln(y);
203 					//if((sY + y >= totalY) && !wrapMode) break;
204 					//if(y + sY >= 0){
205 					int offsetP = y*pitch;	// The offset of the line that is being written
206 					int offsetY = tileY * ((y + sY)%tileY);
207 					int offsetX = sX%tileX;
208 					//int outscrollX = sX<0 ? sX*-1 : 0;
209 					//int tnXreg = (sX-(sX%tileX))/tileX;		
210 					//int tnXC = tnXreg + (rasterX/tileX);
211 					//bool finish;
212 					//writeln(offsetY);
213 					//while(!finish){
214 					int x = sX < 0 ? sX * -1 : 0;
215 					int targetX = totalX - sX > rasterX ? rasterX : rasterX - (totalX - sX);
216 					void *p0 = (workpad + (x*4) + offsetP);
217 					while(x < targetX){
218 						//writeln(tnXreg+(mX*((y+sY-((y+sY)%tileY))/tileY)));
219 						//ushort[] chunk = tileSet[mapping[tnXreg+(mX*((y+sY-((y+sY)%tileY))/tileY))]].readRow((y+sY)%tileY);
220 						
221 						//ushort *c = tileSet[mapping[tnXreg+(mX*((y+sY-((y+sY)%tileY))/tileY))]].getPtr();
222 
223 						wchar currentTile = tileByPixel(x+sX,y+sY);
224 						if(currentTile != 0xFFFF){ // skip if tile is null
225 							//writeln(currentTile);
226 							int tileXtarget = x + tileX < rasterX ? tileX : tileX - ((x + tileX) - rasterX);	// 
227 							//if(tileXtarget + x > ){}
228 							int xp = (offsetX != 0 && x == 0) ? offsetX : 0;	// 
229 							ushort *c = tileSet[currentTile].getPtr();	// pointer to the current tile's pixeldata
230 							c += offsetY;
231 							c += xp;
232 							//int foo = (tnXreg*tileX);
233 							for(; xp < tileXtarget-3; xp+=4){
234 
235 								ubyte[16] *p = cast(ubyte[16]*)p0;
236 								//writeln(p,',',x,',',y,',',xp);
237 								ubyte[16] src;
238 								//writeln(*c);
239 								*cast(ubyte[4]*)(src.ptr) = *cast(ubyte[4]*)(palette.ptr + (4 * *c));
240 								c++;
241 								*cast(ubyte[4]*)(src.ptr + 4) = *cast(ubyte[4]*)(palette.ptr + (4 * *c));
242 								c++;
243 								*cast(ubyte[4]*)(src.ptr + 8) = *cast(ubyte[4]*)(palette.ptr + (4 * *c));
244 								c++;
245 								*cast(ubyte[4]*)(src.ptr + 12) = *cast(ubyte[4]*)(palette.ptr + (4 * *c));
246 								c++;
247 								ubyte[16] alpha = [src[0],src[0],src[0],src[0],src[4],src[4],src[4],src[4],src[8],src[8],src[8],src[8],src[12],src[12],src[12],src[12]];
248 								//uint[4] alpha;
249 								//writeln(src);
250 								asm{
251 									//calculating alpha
252 									//pxor	XMM1, XMM1;
253 									movups	XMM0, alpha;
254 									
255 									movups	XMM1, XMM0;
256 									punpcklbw	XMM0, XMM2;
257 									punpckhbw	XMM1, XMM2;
258 									movaps	XMM6, alphaSSEConst256;
259 									movaps	XMM7, XMM6;
260 									movaps	XMM4, alphaSSEConst1;
261 									movaps	XMM5, XMM4;
262 									
263 									
264 									//punpcklbw	XMM1, XMM2;
265 									
266 									paddusw	XMM4, XMM0;	//1 + alpha01
267 									paddusw	XMM5, XMM1;
268 									psubusw	XMM6, XMM0;	//256 - alpha01
269 									psubusw	XMM7, XMM1;
270 									
271 									//moving the values to their destinations
272 									mov		EBX, p[EBP];
273 									movups	XMM0, src;	//src01
274 									movups	XMM1, XMM0; //src23
275 									punpcklbw	XMM0, XMM2;
276 									punpckhbw	XMM1, XMM2;
277 									pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
278 									pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
279 									movups	XMM0, [EBX];	//dest01
280 									movups	XMM1, XMM0;		//dest23
281 									punpcklbw	XMM0, XMM2;
282 									punpckhbw	XMM1, XMM2;
283 									pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
284 									pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
285 									
286 									paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
287 									paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
288 									psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
289 									psrlw	XMM5, 8;
290 									//moving the result to its place;
291 									//pxor	MM2, MM2;
292 									packuswb	XMM4, XMM5;
293 									
294 									movups	[EBX], XMM4;
295 									
296 									//emms;
297 								}
298 								//writeln(*p);
299 								x+=4;
300 								p0+=16;
301 							}
302 							for(; xp < tileXtarget; xp++){
303 								ubyte[4] *p = cast(ubyte[4]*)p0;
304 								ubyte[4] src = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
305 								c++;
306 								ushort[4] alpha = [src[0],src[0],src[0],src[0]];
307 								asm{
308 									pxor	XMM3, XMM3;
309 									movq	XMM2, alpha;
310 									mov		EBX, p[EBP];
311 									movd	XMM0, [EBX];
312 									movd	XMM1, src;
313 									punpcklbw	XMM0, XMM3;//dest
314 									punpcklbw	XMM1, XMM3;//src
315 									//punpcklbw	XMM2, XMM3;//alpha
316 									movaps	XMM4, alphaSSEConst256;
317 									movaps	XMM5, alphaSSEConst1;
318 									
319 									paddusw XMM5, XMM2;//1+alpha
320 									psubusw	XMM4, XMM2;//256-alpha
321 									
322 									pmullw	XMM0, XMM4;//dest*(256-alpha)
323 									pmullw	XMM1, XMM5;//src*(1+alpha)
324 									paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
325 									psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
326 									//pxor	XMM7, XMM7;
327 									packuswb	XMM0, XMM3;
328 									
329 									movd	[EBX], XMM0;
330 									
331 									//pxor	XMM0, XMM0;
332 									//pxor	XMM1, XMM1;
333 									pxor	XMM2, XMM2;
334 								}
335 								x++;
336 								p0+=4;
337 							}
338 							/*ushort c = chunk[x];
339 							 alphaBlend(palette[(c*4)+1],palette[(c*4)+2],palette[(c*4)+3],palette[(c*4)], workpad + ((tnXreg*tileX)+x-sX)*4 + y*pitch);*/
340 							
341 							
342 						}else{
343 							x+=tileX;
344 						}
345 					}
346 					
347 				}break;
348 			case TileLayerRenderingMode.BLITTER:
349 				int y = sY < 0 ? sY * -1 : 0;
350 
351 				for( ; y < rasterY ; y++){
352 
353 					int offsetP = y*pitch;	// The offset of the line that is being written
354 					int offsetY = tileY * ((y + sY)%tileY);
355 					int offsetX = sX%tileX;
356 
357 					int x = sX < 0 ? sX * -1 : 0;
358 					int targetX = totalX - sX > rasterX ? rasterX : rasterX - (totalX - sX);
359 					void *p0 = (workpad + (x*4) + offsetP);
360 					while(x < targetX){
361 
362 						wchar currentTile = tileByPixel(x+sX,y+sY);
363 						if(currentTile != 0xFFFF){
364 							int tileXtarget = x + tileX < rasterX ? tileX : tileX - ((x + tileX) - rasterX);	// 
365 
366 							//int xp;	// 
367 							int xp = (offsetX != 0 && x == 0) ? offsetX : 0;	// 
368 							ushort *c = tileSet[currentTile].getPtr();	// pointer to the current tile's pixeldata
369 							c += offsetY;
370 							c += xp;
371 							//int foo = (tnXreg*tileX);
372 							for(; xp < tileXtarget-3; xp+=4){
373 								
374 								ubyte[16] *p = cast(ubyte[16]*)p0;
375 								ubyte[16] src;
376 								*cast(ubyte[4]*)(src.ptr) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
377 								c++;
378 								*cast(ubyte[4]*)(src.ptr + 4) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
379 								c++;
380 								*cast(ubyte[4]*)(src.ptr + 8) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
381 								c++;
382 								*cast(ubyte[4]*)(src.ptr + 12) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
383 								c++;
384 								ubyte[16] alpha = [src[0],src[0],src[0],src[0],src[4],src[4],src[4],src[4],src[8],src[8],src[8],src[8],src[12],src[12],src[12],src[12]];
385 
386 								asm{
387 									//generating copying mask
388 									pxor	XMM1, XMM1;
389 									movups	XMM0, alpha;
390 									pcmpgtd	XMM0, XMM1;
391 
392 									mov		EBX, p[EBP];
393 									movups	XMM2, src;
394 									movups	XMM3, [EBX];
395 									//the blitter algorithm
396 									pand	XMM3, XMM0;
397 									por		XMM3, XMM2;
398 									//writeback
399 									movups	[EBX], XMM3;
400 
401 								}
402 								x+=4;
403 								p0+=16;
404 							}
405 							for(; xp < tileXtarget; xp++){
406 								ubyte[4] *p = cast(ubyte[4]*)p0;
407 								ubyte[4] src = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
408 								c++;
409 								ubyte[4] alpha = [src[0],src[0],src[0],src[0]];
410 								asm{
411 									//generating copying mask
412 									pxor	XMM1, XMM1;
413 									movd	XMM0, alpha;
414 									pcmpgtd	XMM0, XMM1;
415 									
416 									mov		EBX, p[EBP];
417 									movd	XMM2, src;
418 									movd	XMM3, [EBX];
419 									//the blitter algorithm
420 									pand	XMM3, XMM0;
421 									por		XMM3, XMM2;
422 									//writeback
423 									movd	[EBX], XMM3;
424 
425 								}
426 								x++;
427 								p0+=4;
428 							}
429 
430 						}else{
431 							x+=tileX;
432 						}
433 					}
434 					
435 				}
436 				break;
437 			default:
438 				int y = sY < 0 ? sY * -1 : 0;
439 				
440 				for( ; y < rasterY ; y++){
441 					
442 					int offsetP = y*pitch;	// The offset of the line that is being written
443 					int offsetY = tileY * (y - sY)%tileY;
444 					
445 					int x = sX < 0 ? sX * -1 : 0;
446 					int targetX = totalX - sX > rasterX ? rasterX : rasterX - (totalX - sX);
447 					void *p0 = (workpad + (x*4) + offsetP);
448 					while(x < targetX){
449 						
450 						wchar currentTile = tileByPixel(x+sX,y+sY);
451 						if(currentTile != 0x0000){
452 							int tileXtarget = x + tileX < rasterX ? tileX : tileX - ((x + tileX) - rasterX);	// 
453 							
454 							int xp;	// 
455 							ushort *c = tileSet[currentTile].getPtr();	// pointer to the current tile's pixeldata
456 							c += offsetY;
457 							//int foo = (tnXreg*tileX);
458 							for(; xp < tileXtarget-3; xp+=4){
459 								
460 								ubyte[16] *p = cast(ubyte[16]*)p0;
461 								ubyte[16] src;
462 								*cast(ubyte[4]*)(src.ptr) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
463 								c++;
464 								*cast(ubyte[4]*)(src.ptr + 4) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
465 								c++;
466 								*cast(ubyte[4]*)(src.ptr + 8) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
467 								c++;
468 								*cast(ubyte[4]*)(src.ptr + 12) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
469 								c++;
470 								//ubyte[16] alpha = [src[0],src[0],src[0],src[0],src[4],src[4],src[4],src[4],src[8],src[8],src[8],src[8],src[12],src[12],src[12],src[12]];
471 								
472 								asm{
473 
474 									mov		EBX, p[EBP];
475 									movups	XMM2, src;
476 									//writeback
477 									movups	[EBX], XMM2;
478 									
479 								}
480 								x+=4;
481 								p0+=16;
482 							}
483 							for(; xp < tileXtarget; xp++){
484 								ubyte[4] *p = cast(ubyte[4]*)p0;
485 								ubyte[4] src = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
486 
487 								c++;
488 								//ubyte[4] alpha = [src[0],src[0],src[0],src[0]];
489 								asm{
490 
491 									mov		EBX, p[EBP];
492 									movd	XMM2, src;
493 									//writeback
494 									movd	[EBX], XMM2;
495 									
496 								}
497 								x++;
498 								p0+=4;
499 							}
500 							
501 						}else{
502 							x+=tileX;
503 						}
504 					}
505 					
506 				}
507 				break;
508 		}
509 	}
510 	
511 	public void updateRaster(Bitmap16Bit frameBuffer){
512 		if(sX + rasterX <= 0 || sX > totalX) return;
513 		for(int y ; y < rasterY ; y++){
514 			if(sY + y >= totalY) break;
515 			if(y + sY >= 0){
516 				
517 				//int outscrollX = sX<0 ? sX*-1 : 0;
518 				int tnXreg = sX>0 ? (sX-(sX%tileX))/tileX : 0;
519 				//int tnXC = tnXreg + (rasterX/tileX);
520 				bool finish;
521 				while(!finish){
522 					//writeln(tnXreg+(mX*((y+sY-((y+sY)%tileY))/tileY)));
523 					ushort[] chunk = tileSet[mapping[tnXreg+(mX*((y+sY-((y+sY)%tileY))/tileY))]].readRow((y+sY)%tileY);
524 					for(int x; x <tileX; x++){
525 						
526 						if((tnXreg*tileX)+x-sX >= 0 && (tnXreg*tileX)+x-sX < rasterX){
527 							frameBuffer.writePixel((tnXreg*tileX)+x-sX,y,chunk[x]);
528 						}else if((tnXreg*tileX)+x-sX >= rasterX){
529 							finish = true;
530 						}
531 					}
532 					tnXreg++;
533 					if(tnXreg == mX){ finish = true;}
534 				}
535 			}
536 		}
537 		
538 	}
539 	
540 	public BLInfo getLayerInfo(){
541 		return BLInfo(tileX,tileY,mX,mY);
542 	}
543 	public Bitmap16Bit getTile(wchar id){
544 		return tileSet[id];
545 	}
546 	public wchar[] getMapping(){
547 		return mapping;
548 	}
549 }
/**
 * Access to the sprite data, used by the collision detectors.
 */
public interface ISpriteCollision{
	/// Returns the sprite coordinates, keyed by sprite ID.
	Coordinate[int] getCoordinates();
	/// Returns the flip registers, keyed by sprite ID.
	FlipRegister[int] getFlipRegisters();
	/// Returns the sprite sorter.
	int[int] getSpriteSorter();
}
560 
/**
 * Sprite handling functions that do not depend on the bitmap type.
 */
public interface ISpriteLayer{
	/// Removes the sprite with the ID n.
	void removeSprite(int n);
	/// Moves the sprite with the ID n to the position x, y.
	void moveSprite(int n, int x, int y);
	/// Moves the sprite with the ID n by x, y.
	void relMoveSprite(int n, int x, int y);
}
/**
 * Sprite layer interface for Bitmap16Bit sprites.
 */
public interface ISpriteLayer16Bit : ISpriteLayer{
	/// Adds the sprite s with the ID n at the coordinates c.
	void addSprite(Bitmap16Bit s, int n, Coordinate c);
	/// Adds the sprite s with the ID n at the position x, y.
	void addSprite(Bitmap16Bit s, int n, int x, int y);
	/// Replaces the bitmap of the sprite n.
	void replaceSprite(Bitmap16Bit s, int n);
	/// Replaces the bitmap of the sprite n and places it at x, y.
	void replaceSprite(Bitmap16Bit s, int n, int x, int y);
	/// Replaces the bitmap and the coordinates of the sprite n.
	void replaceSprite(Bitmap16Bit s, int n, Coordinate c);
}
/**
 * Sprite layer interface for Bitmap32Bit sprites.
 */
public interface ISpriteLayer32Bit : ISpriteLayer{
	/// Adds the sprite s with the ID n at the coordinates c.
	void addSprite(Bitmap32Bit s, int n, Coordinate c);
	/// Adds the sprite s with the ID n at the position x, y.
	void addSprite(Bitmap32Bit s, int n, int x, int y);
	/// Replaces the bitmap of the sprite n.
	void replaceSprite(Bitmap32Bit s, int n);
	/// Replaces the bitmap of the sprite n and places it at x, y.
	void replaceSprite(Bitmap32Bit s, int n, int x, int y);
	/// Replaces the bitmap and the coordinates of the sprite n.
	void replaceSprite(Bitmap32Bit s, int n, Coordinate c);
}
/**
 * Implement it to get notified about sprite movements, e.g. to call the collision detector.
 */
public interface SpriteMovementListener{
	/// Called with the ID of the sprite that has been moved.
	void spriteMoved(int ID);
}
588 /**
589  *Sprite controller and renderer.
590  */
591 public class SpriteLayer : Layer, ISpriteCollision, ISpriteLayer16Bit{
592 	private Bitmap16Bit[int] spriteSet;
593 	private Coordinate[int] coordinates;		//Use moveSprite() and relMoveSprite() instead to move sprites
594 	private FlipRegister[int] flipRegisters;
595 	private int[] spriteSorter;
596 	public SpriteMovementListener[int] collisionDetector;
597 	//Constructors. 
598 	/*public this(int n){
599 	 spriteSet.length = n;
600 	 coordinates.length = n;
601 	 flipRegisters.length = n;
602 	 }*/
603 	
604 	public this(){
605 		
606 	}
607 	
608 	public void addSprite(Bitmap16Bit s, int n, Coordinate c){
609 		spriteSet[n] = s;
610 		coordinates[n] = c;
611 		flipRegisters[n] = FlipRegister.NORM;
612 		spriteSorter ~= n;
613 		//sortSprites();
614 		spriteSorter.sort();
615 		
616 	}
617 	
618 	public void addSprite(Bitmap16Bit s, int n, int x, int y){
619 		spriteSet[n] = s;
620 		coordinates[n] = Coordinate(x,y,x+s.getX(),y+s.getY());
621 		flipRegisters[n] = FlipRegister.NORM;
622 		//spriteSorter[n] = n;
623 		spriteSorter ~= n;
624 		//sortSprites();
625 		
626 		spriteSorter.sort();
627 		
628 	}
629 	/**
630 	 * 
631 	 */
632 	public void replaceSprite(Bitmap16Bit s, int n){
633 
634 		if(!(s.getX == spriteSet[n].getX && s.getY == spriteSet[n].getY)){
635 			coordinates[n] = Coordinate(coordinates[n].left,coordinates[n].top,coordinates[n].left + s.getX,coordinates[n].top + s.getY);
636 		}
637 		spriteSet[n] = s;
638 	}
639 
640 	public void replaceSprite(Bitmap16Bit s, int n, int x, int y){
641 		spriteSet[n] = s;
642 		coordinates[n] = Coordinate(x,y,x+s.getX(),y+s.getY());
643 	}
644 
645 	public void replaceSprite(Bitmap16Bit s, int n, Coordinate c){
646 		spriteSet[n] = s;
647 		coordinates[n] = c;
648 	}
649 	
650 	/*public ushort getTransparencyIndex(){
651 		return transparencyIndex;
652 	}*/
653 	
654 	public void removeSprite(int n){
655 		//spriteSorter.remove(n);
656 		coordinates.remove(n);
657 		flipRegisters.remove(n);
658 		spriteSet.remove(n);
659 		int[] newSpriteSorter;
660 		for(int i; i < spriteSorter.length; i++){
661 			//writeln(0);
662 			if(spriteSorter[i] != n){
663 				newSpriteSorter ~= spriteSorter[i];
664 				
665 			}
666 		}
667 		spriteSorter = newSpriteSorter;
668 		//writeln(spriteSorter);
669 		//sortSprites();
670 	}
671 	public void moveSprite(int n, int x, int y){
672 		coordinates[n].move(x,y);
673 		callCollisionDetector(n);
674 	}
675 	public void relMoveSprite(int n, int x, int y){
676 		coordinates[n].relMove(x,y);
677 		callCollisionDetector(n);
678 	}
679 	
680 	public Bitmap16Bit[int] getSpriteSet(){
681 		return spriteSet;
682 	}
683 	
684 	public Coordinate[int] getCoordinates(){
685 		return coordinates;
686 	}
687 	
688 	public FlipRegister[int] getFlipRegisters(){
689 		return flipRegisters;
690 	}
691 	public int[int] getSpriteSorter(){
692 		return null;
693 	}
694 	
695 	private void callCollisionDetector(int n){
696 		foreach(c; collisionDetector){
697 			c.spriteMoved(n);
698 		}
699 	}
700 	
701 	public override void updateRaster(void* workpad, int pitch, ubyte[] palette, int[] threads){
702 		foreach_reverse(int i ; spriteSorter){
703 			/*foreach(int i ; spriteSet.byKey){*/
704 			if((coordinates[i].right > sX && coordinates[i].bottom > sY) && (coordinates[i].left < sX + rasterX && coordinates[i].top < sY + rasterY)) {
705 				//writeln(i);
706 				int offsetXA, offsetXB, offsetYA, offsetYB, sizeX = coordinates[i].getXSize(), offsetX = coordinates[i].left - sX;
707 				if(sX > coordinates[i].left) {offsetXA = sX - coordinates[i].left; }
708 				if(sY > coordinates[i].top) {offsetYA = sY - coordinates[i].top; }
709 				if(sX + rasterX < coordinates[i].right) {offsetXB = coordinates[i].right - rasterX; }
710 				if(sY + rasterY < coordinates[i].bottom) {offsetYB = coordinates[i].bottom - rasterY; }
711 				ushort* p0 = spriteSet[i].getPtr();
712 				for(int y = offsetYA ; y < coordinates[i].getYSize() - offsetYB ; y++){
713 					//ushort[] chunk = (flipRegisters[i] == FlipRegister.Y || flipRegisters[i] == FlipRegister.XY) ? spriteSet[i].readRowReverse(y) : spriteSet[i].readRow(y);
714 					int offsetP = sizeX * y, offsetY = (coordinates[i].top - sY + y)*pitch;
715 					int x = offsetXA;
716 					//if(x < 0) writeln(x); 
717 					if(flipRegisters[i] == FlipRegister.X || flipRegisters[i] == FlipRegister.XY){
718 						for(; x < sizeX - offsetXB ; x+=4){
719 							ushort* c = (p0 + (sizeX - x - 1) + offsetP);
720 
721 							//ushort c = chunk[chunk.length-x-1];
722 							//alphaBlend(palette[(c*4)+1],palette[(c*4)+2],palette[(c*4)+3],palette[(c*4)], workpad + (coordinates[i].xa - sX + x)*4 + (coordinates[i].ya - sY + y)*pitch);
723 							//alphaBlend(*cast(ubyte[4]*)(palette.ptr + 4 * c), workpad + (coordinates[i].xa - sX + x)*4 + (coordinates[i].ya - sY + y)*pitch);
724 							//ubyte[4] src = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
725 							//ubyte[4] *p = cast(ubyte[4]*)(workpad + (offsetX + x)*4 + offsetY);
726 							/*if(src[0] == 255){
727 								*p = src;
728 							}
729 							else if(src[0] != 0){
730 								ubyte[4] dest2 = *p;
731 								dest2[1] = to!ubyte((src[1] * src[0] + dest2[1] * (255 - src[0]))>>8);
732 								dest2[2] = to!ubyte((src[2] * src[0] + dest2[2] * (255 - src[0]))>>8);
733 								dest2[3] = to!ubyte((src[3] * src[0] + dest2[3] * (255 - src[0]))>>8);
734 								*p = dest2;
735 							}*/
736 							ubyte[16] *p = cast(ubyte[16]*)(workpad + (offsetX + x)*4 + offsetY);
737 							ubyte[16] src;
738 							//uint[4] src;
739 							*cast(ubyte[4]*)(src.ptr) = *cast(ubyte[4]*)(palette.ptr + 4 * *(c+3));
740 							*cast(ubyte[4]*)(src.ptr + 4) = *cast(ubyte[4]*)(palette.ptr + 4 * *(c+2));
741 							*cast(ubyte[4]*)(src.ptr + 8) = *cast(ubyte[4]*)(palette.ptr + 4 * *(c+1));
742 							*cast(ubyte[4]*)(src.ptr + 12) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
743 							ubyte[16] alpha = [src[0],src[0],src[0],src[0],src[4],src[4],src[4],src[4],src[8],src[8],src[8],src[8],src[12],src[12],src[12],src[12]];
744 
745 
746 							asm{
747 								//calculating alpha
748 								//pxor	XMM1, XMM1;
749 								movups	XMM0, alpha;
750 								
751 								movups	XMM1, XMM0;
752 								punpcklbw	XMM0, XMM2;
753 								punpckhbw	XMM1, XMM3;
754 								movaps	XMM6, alphaSSEConst256;
755 								movaps	XMM7, XMM6;
756 								movaps	XMM4, alphaSSEConst1;
757 								movaps	XMM5, XMM4;
758 
759 								
760 								//punpcklbw	XMM1, XMM2;
761 								
762 								paddusw	XMM4, XMM1;	//1 + alpha01
763 								paddusw	XMM5, XMM0;
764 								psubusw	XMM6, XMM1;	//256 - alpha01
765 								psubusw	XMM7, XMM0;
766 								
767 								//moving the values to their destinations
768 								mov		EBX, p[EBP];
769 								movups	XMM0, src;	//src01
770 								movups	XMM1, XMM0; //src23
771 								punpcklbw	XMM0, XMM2;
772 								punpckhbw	XMM1, XMM3;
773 								pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
774 								pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
775 								movups	XMM0, [EBX];	//dest01
776 								movups	XMM1, XMM0;		//dest23
777 								punpcklbw	XMM0, XMM2;
778 								punpckhbw	XMM1, XMM3;
779 								pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
780 								pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
781 								
782 								paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
783 								paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
784 								psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
785 								psrlw	XMM5, 8;
786 								//moving the result to its place;
787 								//pxor	MM2, MM2;
788 								packuswb	XMM4, XMM5;
789 								
790 								movups	[EBX], XMM4;
791 								
792 								//emms;
793 							}
794 						}
795 						for(; x < sizeX - offsetXB ; x++){
796 							ushort* c = (p0 + (sizeX - x - 1) + offsetP);
797 							
798 							ubyte[4] *p = cast(ubyte[4]*)(workpad + (offsetX + x)*4 + offsetY);
799 							ubyte[4] src = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
800 							ushort[4] alpha = [src[0],src[0],src[0],src[0]];
801 							asm{
802 								pxor	XMM3, XMM3;
803 								movq	XMM2, alpha;
804 								mov		EBX, p[EBP];
805 								movd	XMM0, [EBX];
806 								movd	XMM1, src;
807 								punpcklbw	XMM0, XMM3;//dest
808 								punpcklbw	XMM1, XMM3;//src
809 								//punpcklbw	XMM2, XMM3;//alpha
810 								movaps	XMM4, alphaSSEConst256;
811 								movaps	XMM5, alphaSSEConst1;
812 								
813 								paddusw XMM5, XMM2;//1+alpha
814 								psubusw	XMM4, XMM2;//256-alpha
815 								
816 								pmullw	XMM0, XMM4;//dest*(256-alpha)
817 								pmullw	XMM1, XMM5;//src*(1+alpha)
818 								paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
819 								psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
820 								//pxor	XMM7, XMM7;
821 								packuswb	XMM0, XMM3;
822 								
823 								movd	[EBX], XMM0;
824 								
825 								//pxor	XMM0, XMM0;
826 								//pxor	XMM1, XMM1;
827 								pxor	XMM2, XMM2;
828 							}
829 							
830 						}
831 					}
832 					else{ //for non flipped sprites
833 						void* pl = (workpad + (offsetX + x)*4 + offsetY);
834 						ushort* c = p0 + x + offsetP;
835 						for(; x < sizeX - offsetXB - 3 ; x+=4){
836 							//ushort* c = p0 + x + offsetP;
837 							ubyte[16] *p = cast(ubyte[16]*)pl;		//(workpad + (offsetX + x)*4 + offsetY);
838 							ubyte[16] src;
839 							*cast(ubyte[4]*)(src.ptr) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
840 							c++;
841 							*cast(ubyte[4]*)(src.ptr + 4) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
842 							c++;
843 							*cast(ubyte[4]*)(src.ptr + 8) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
844 							c++;
845 							*cast(ubyte[4]*)(src.ptr + 12) = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
846 							c++;
847 							ubyte[16] alpha = [src[12],src[12],src[12],src[12],src[8],src[8],src[8],src[8],src[4],src[4],src[4],src[4],src[0],src[0],src[0],src[0]];
848 
849 							//uint[4] src;
850 							//uint[4] alpha;
851 
852 							asm{
853 								//do a test if alpha-blending and/or blitter can avoided
854 								/*
855 								movups	XMM0, alpha;
856 								pxor 	XMM1, XMM1;
857 								pcmpeqq	XMM1, XMM0; //use packed testing of SSE to figure out if any operation can be skipped
858 								je 		endofalgorithm;
859 								movaps	XMM3, SSEUQWmaxvalue; //use further tests if blitter can be used
860 								pcmpeqq	XMM3, XMM0;
861 								pand	XMM3, XMM1;
862 								pcmpeqq XMM3, SSEUQWmaxvalue;
863 								jne		alphablend;
864 
865 								//blitter routine
866 								mov		EBX, p[EBP];
867 								movups	XMM0, src;
868 								movups	XMM1, [EBX];
869 								pxor	XMM3, XMM3;
870 								pcmpeqq	XMM3, XMM0;
871 								pand	XMM1, XMM3;
872 								por		XMM1, XMM0;
873 								movups	[EBX], XMM1;
874 								jmp 	endofalgorithm;
875 
876 							alphablend:*/
877 								//calculating alpha
878 								//pxor	XMM1, XMM1;
879 							
880 								movups	XMM0, alpha;
881 								movups	XMM1, XMM0;
882 								punpcklbw	XMM0, XMM2;
883 								punpckhbw	XMM1, XMM2;
884 								movaps	XMM6, alphaSSEConst256;
885 								movaps	XMM7, XMM6;
886 								movaps	XMM4, alphaSSEConst1;
887 								movaps	XMM5, XMM4;
888 
889 
890 								//punpcklbw	XMM1, XMM2;
891 								
892 								paddusw	XMM4, XMM0;	//1 + alpha01
893 								paddusw	XMM5, XMM1;
894 								psubusw	XMM6, XMM0;	//256 - alpha01
895 								psubusw	XMM7, XMM1;
896 
897 								//moving the values to their destinations
898 								mov		EBX, p[EBP];
899 								movups	XMM0, src;	//src01
900 								movups	XMM1, XMM0; //src23
901 								punpcklbw	XMM0, XMM2;
902 								punpckhbw	XMM1, XMM2;
903 								pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
904 								pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
905 								movups	XMM0, [EBX];	//dest01
906 								movups	XMM1, XMM0;		//dest23
907 								punpcklbw	XMM0, XMM2;
908 								punpckhbw	XMM1, XMM2;
909 								pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
910 								pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
911 
912 								paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
913 								paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
914 								psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
915 								psrlw	XMM5, 8;
916 								//moving the result to its place;
917 								//pxor	MM2, MM2;
918 								packuswb	XMM4, XMM5;
919 								
920 								movups	[EBX], XMM4;
921 
922 							//endofalgorithm:
923 
924 							}
925 							pl += 16;
926 							//c += 4;
927 							//*p = [res[0],res[2],res[4],res[6]];
928 							//ubyte[4] res = *p;
929 							//writeln(res);
930 							
931 							//}
932 						}
933 						for(; x < sizeX - offsetXB ; x++){
934 							//ushort* c = p0 + x + offsetP;
935 							
936 							ubyte[4] *p = cast(ubyte[4]*)pl;		//(workpad + (offsetX + x)*4 + offsetY);
937 							ubyte[4] src = *cast(ubyte[4]*)(palette.ptr + 4 * *c);
938 							ushort[4] alpha = [src[0],src[0],src[0],src[0]];
939 							asm{
940 								//pxor	XMM3, XMM3;
941 								movq	XMM2, alpha;
942 								mov		EBX, p[EBP];
943 								movd	XMM0, [EBX];
944 								movd	XMM1, src;
945 								punpcklbw	XMM0, XMM3;//dest
946 								punpcklbw	XMM1, XMM3;//src
947 								//punpcklbw	XMM2, XMM3;//alpha
948 								movaps	XMM4, alphaSSEConst256;
949 								movaps	XMM5, alphaSSEConst1;
950 
951 								paddusw XMM5, XMM2;//1+alpha
952 								psubusw	XMM4, XMM2;//256-alpha
953 
954 								pmullw	XMM0, XMM4;//dest*(256-alpha)
955 								pmullw	XMM1, XMM5;//src*(1+alpha)
956 								paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
957 								psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
958 								//pxor	XMM7, XMM7;
959 								packuswb	XMM0, XMM3;
960 
961 								movd	[EBX], XMM0;
962 
963 								//pxor	XMM0, XMM0;
964 								//pxor	XMM1, XMM1;
965 								pxor	XMM2, XMM2;
966 							}
967 							pl += 4;
968 							c++;
969 						}
970 					}
971 				}
972 			}
973 		}
974 	}
975 	
976 	/*public void updateRaster(Bitmap16Bit frameBuffer){
977 		//writeln(spriteSorter);
978 		foreach_reverse(int i ; spriteSorter){
979 			/*foreach(int i ; spriteSet.byKey){*/
980 			/*if((coordinates[i].right > sX && coordinates[i].bottom > sY) && (coordinates[i].left < sX + rasterX && coordinates[i].right < sY + rasterY)) {
981 				//writeln(i);
982 				int offsetXA, offsetXB, offsetYA, offsetYB;
983 				//if(sX > coordinates[i].xa) {offsetXA = sX - coordinates[i].xa; }
984 				if(sY > coordinates[i].top) {offsetYA = sY - coordinates[i].top; }
985 				//if(sX + rasterX < coordinates[i].xb) {offsetXB = sX - coordinates[i].xb - rasterX; }
986 				if(sY + rasterY < coordinates[i].bottom) {offsetYB = coordinates[i].bottom - rasterY; }
987 				for(int y = offsetYA ; y < coordinates[i].getYSize() - offsetYB ; y++){
988 					ushort[] chunk = (flipRegisters[i] == FlipRegister.Y || flipRegisters[i] == FlipRegister.XY) ? spriteSet[i].readRowReverse(y) : spriteSet[i].readRow(y);
989 					if(flipRegisters[i] == FlipRegister.X || flipRegisters[i] == FlipRegister.XY){
990 						for(int x ; x < chunk.length ; x++){
991 							if(coordinates[i].left - sX + x >= 0 && coordinates[i].left - sX + x < rasterX){
992 								if(chunk[chunk.length-x-1] != transparencyIndex) frameBuffer.writePixel(coordinates[i].left - sX + x, coordinates[i].top - sY + y, chunk[chunk.length-x-1]);
993 							}
994 						}
995 					}
996 					else{
997 						for(int x ; x < chunk.length ; x++){
998 							if(coordinates[i].left - sX + x >= 0 && coordinates[i].left - sX + x < rasterX){
999 								if(chunk[x] != transparencyIndex) frameBuffer.writePixel(coordinates[i].left - sX + x, coordinates[i].top - sY + y, chunk[x]);
1000 							}
1001 						}
1002 					}
1003 				}
1004 			}
1005 		}
1006 	}*/
1007 	
1008 	
1009 }
1010 
/**
 * Sprite layer that draws 32 bit (four bytes per pixel) sprites onto a 32 bit workpad.
 *
 * Every sprite is identified by an integer id. The ids are kept sorted in spriteSorter and
 * the sprites are drawn in reverse id order, so lower ids end up on top of higher ones.
 * Blending is done with SSE2 inline assembly; byte 0 of every source pixel is used as its alpha value.
 *
 * NOTE: the inline assembly addresses locals through EBP/EBX, so it targets 32 bit x86 only.
 * NOTE: flipRegisters are stored per sprite but are not applied by updateRaster of this class.
 */
public class SpriteLayer32Bit : Layer, ISpriteCollision, ISpriteLayer32Bit{
	private Bitmap32Bit[int] spriteSet;
	private Coordinate[int] coordinates;		//Use moveSprite() and relMoveSprite() instead to move sprites
	private FlipRegister[int] flipRegisters;
	private int[] spriteSorter;				//Sprite ids in ascending order, defines the drawing order
	public SpriteMovementListener[int] collisionDetector;


	public this(){

	}

	/**
	 * Adds sprite s with the id n at the position and size described by c.
	 * If the id is already in use, the old sprite is overwritten.
	 */
	public void addSprite(Bitmap32Bit s, int n, Coordinate c){
		registerSprite(s, n, c);
	}

	/**
	 * Adds sprite s with the id n. Its upper-left corner is placed at x,y, its size is taken from the bitmap.
	 * If the id is already in use, the old sprite is overwritten.
	 */
	public void addSprite(Bitmap32Bit s, int n, int x, int y){
		registerSprite(s, n, Coordinate(x, y, x + s.getX, y + s.getY));
	}

	/**
	 * Common implementation of the addSprite overloads.
	 * The id is only appended to spriteSorter if it was not registered before, otherwise the sprite
	 * would be listed (and therefore drawn) more than once.
	 */
	private void registerSprite(Bitmap32Bit s, int n, Coordinate c){
		const bool isNewId = (n in spriteSet) is null;
		spriteSet[n] = s;
		coordinates[n] = c;
		flipRegisters[n] = FlipRegister.NORM;
		if(isNewId){
			spriteSorter ~= n;
			spriteSorter.sort();
		}
	}

	//The replaceSprite overloads are not implemented yet, calling them has no effect.
	public void replaceSprite(Bitmap32Bit s, int n){}
	public void replaceSprite(Bitmap32Bit s, int n, int x, int y){}
	public void replaceSprite(Bitmap32Bit s, int n, Coordinate c){}

	/**
	 * Removes the sprite with the id n together with its coordinates and flip register.
	 * Does nothing if the id is not in use.
	 */
	public void removeSprite(int n){
		coordinates.remove(n);
		flipRegisters.remove(n);
		spriteSet.remove(n);
		//Rebuild the drawing order without the removed id, the remaining ids stay sorted.
		int[] remaining;
		foreach(int id ; spriteSorter){
			if(id != n){
				remaining ~= id;
			}
		}
		spriteSorter = remaining;
	}

	/// Moves the sprite with the id n to x,y, then notifies the collision detectors.
	public void moveSprite(int n, int x, int y){
		coordinates[n].move(x,y);
		callCollisionDetector(n);
	}

	/// Moves the sprite with the id n by x,y relative to its current position, then notifies the collision detectors.
	public void relMoveSprite(int n, int x, int y){
		coordinates[n].relMove(x,y);
		callCollisionDetector(n);
	}

	/// Returns the sprite bitmaps, indexed by sprite id.
	public Bitmap32Bit[int] getSpriteSet(){
		return spriteSet;
	}

	/// Returns the sprite coordinates, indexed by sprite id.
	public Coordinate[int] getCoordinates(){
		return coordinates;
	}

	/// Returns the flip registers, indexed by sprite id.
	public FlipRegister[int] getFlipRegisters(){
		return flipRegisters;
	}

	/// Always returns null in this class: the sorter is stored as a plain array, not as an associative array.
	public int[int] getSpriteSorter(){
		return null;
	}

	/// Calls spriteMoved(n) on every registered collision detector.
	private void callCollisionDetector(int n){
		foreach(c; collisionDetector){
			c.spriteMoved(n);
		}
	}

	/**
	 * Alpha-blends every sprite that overlaps the visible area onto the workpad.
	 *
	 * For every colour byte: result = (src * (1 + alpha) + dest * (256 - alpha)) / 256.
	 *
	 * Params:
	 *   workpad = pointer to the first byte of the target, four bytes per pixel
	 *   pitch = length of one workpad row in bytes
	 *   palette = unused by this layer, the sprites already contain direct colour data
	 *   threads = unused by this layer
	 */
	public override void updateRaster(void* workpad, int pitch, ubyte[] palette, int[] threads){
		foreach_reverse(int i ; spriteSorter){
			auto pos = coordinates[i];
			//Skip sprites that are completely outside of the visible area.
			if(!(pos.right > sX && pos.bottom > sY && pos.left < sX + rasterX && pos.top < sY + rasterY)){
				continue;
			}
			//offsetXA/YA: pixels clipped on the left/top, offsetXB/YB: pixels clipped on the right/bottom.
			int offsetXA, offsetXB, offsetYA, offsetYB;
			int sizeX = pos.getXSize(), sizeY = pos.getYSize(), offsetX = pos.left - sX;
			if(sX > pos.left) {offsetXA = sX - pos.left; }
			if(sY > pos.top) {offsetYA = sY - pos.top; }
			//The visible area ends at sX + rasterX and sY + rasterY, so the scroll position has to be included.
			if(sX + rasterX < pos.right) {offsetXB = pos.right - (sX + rasterX); }
			if(sY + rasterY < pos.bottom) {offsetYB = pos.bottom - (sY + rasterY); }
			ubyte* p0 = spriteSet[i].getPtr();
			for(int y = offsetYA ; y < sizeY - offsetYB ; y++){
				//offsetP: byte offset of the row within the sprite, offsetY: byte offset of the row within the workpad
				int offsetP = sizeX * y * 4, offsetY = (pos.top - sY + y) * pitch;
				int x = offsetXA;
				//Both the sprite and the workpad have four bytes per pixel, so pixel offsets are scaled by four.
				ubyte* c = p0 + offsetP + x * 4;
				void* pl = workpad + (offsetX + x) * 4 + offsetY;
				//Main loop: four pixels (16 bytes) per iteration
				for(; x < sizeX - offsetXB - 3 ; x+=4){
					ubyte[16] *p = cast(ubyte[16]*)pl;
					ubyte[16] src = *cast(ubyte[16]*)c;
					//Spread the alpha byte (byte 0) of each pixel over all four bytes of that pixel.
					ubyte[16] alpha = [src[0],src[0],src[0],src[0],src[4],src[4],src[4],src[4],src[8],src[8],src[8],src[8],src[12],src[12],src[12],src[12]];

					asm{
						pxor	XMM2, XMM2;		//zero register for the byte -> word unpacking
						//calculating alpha
						movups	XMM0, alpha;	//a01
						movups	XMM1, XMM0;		//a23
						punpcklbw	XMM0, XMM2;
						punpckhbw	XMM1, XMM2;
						//unaligned loads, as the constants are not guaranteed to be 16 byte aligned
						movups	XMM6, alphaSSEConst256;
						movaps	XMM7, XMM6;
						movups	XMM4, alphaSSEConst1;
						movaps	XMM5, XMM4;

						paddusw	XMM4, XMM0;	//1 + alpha01
						paddusw	XMM5, XMM1;	//1 + alpha23
						psubusw	XMM6, XMM0;	//256 - alpha01
						psubusw	XMM7, XMM1;	//256 - alpha23

						//moving the values to their destinations
						mov		EBX, p[EBP];
						movups	XMM0, src;	//src01
						movups	XMM1, XMM0; //src23
						punpcklbw	XMM0, XMM2;
						punpckhbw	XMM1, XMM2;
						pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
						pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
						movups	XMM0, [EBX];	//dest01
						movups	XMM1, XMM0;		//dest23
						punpcklbw	XMM0, XMM2;
						punpckhbw	XMM1, XMM2;
						pmullw	XMM6, XMM0;	//dest01 * (256 - alpha01)
						pmullw	XMM7, XMM1; //dest23 * (256 - alpha23)

						paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
						paddusw	XMM5, XMM7; //(src23 * (1 + alpha23)) + (dest23 * (256 - alpha23))
						psrlw	XMM4, 8;	//divide by 256
						psrlw	XMM5, 8;
						//moving the result to its place
						packuswb	XMM4, XMM5;

						movups	[EBX], XMM4;
					}
					pl += 16;
					c += 16;
				}
				//Remainder loop: the last 0-3 pixels of the row, one pixel per iteration
				for(; x < sizeX - offsetXB ; x++){
					ubyte[4] *p = cast(ubyte[4]*)pl;
					ubyte[4] src = *cast(ubyte[4]*)c;
					ushort[4] alpha = [src[0],src[0],src[0],src[0]];
					asm{
						pxor	XMM3, XMM3;		//zero register for the byte -> word unpacking
						movq	XMM2, alpha;
						mov		EBX, p[EBP];
						movd	XMM0, [EBX];
						movd	XMM1, src;
						punpcklbw	XMM0, XMM3;//dest
						punpcklbw	XMM1, XMM3;//src

						//unaligned loads, as the constants are not guaranteed to be 16 byte aligned
						movups	XMM4, alphaSSEConst256;
						movups	XMM5, alphaSSEConst1;

						paddusw XMM5, XMM2;//1+alpha
						psubusw	XMM4, XMM2;//256-alpha

						pmullw	XMM0, XMM4;//dest*(256-alpha)
						pmullw	XMM1, XMM5;//src*(1+alpha)
						paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
						psrlw	XMM0, 8;//((src*(1+alpha))+(dest*(256-alpha)))/256
						packuswb	XMM0, XMM3;
						movd	[EBX], XMM0;
						pxor	XMM2, XMM2;
					}
					pl += 4;
					c += 4;
				}
			}
		}
	}
}